diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..640a569273986e9b893876dc29671fd5afd38aa6 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,35 +1,40 @@ -*.7z filter=lfs diff=lfs merge=lfs -text -*.arrow filter=lfs diff=lfs merge=lfs -text -*.bin filter=lfs diff=lfs merge=lfs -text -*.bz2 filter=lfs diff=lfs merge=lfs -text -*.ckpt filter=lfs diff=lfs merge=lfs -text -*.ftz filter=lfs diff=lfs merge=lfs -text -*.gz filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text *.h5 filter=lfs diff=lfs merge=lfs -text -*.joblib filter=lfs diff=lfs merge=lfs -text -*.lfs.* filter=lfs diff=lfs merge=lfs -text -*.mlmodel filter=lfs diff=lfs merge=lfs -text -*.model filter=lfs diff=lfs merge=lfs -text -*.msgpack filter=lfs diff=lfs merge=lfs -text -*.npy filter=lfs diff=lfs merge=lfs -text -*.npz filter=lfs diff=lfs merge=lfs -text *.onnx filter=lfs diff=lfs merge=lfs -text -*.ot filter=lfs diff=lfs merge=lfs -text -*.parquet filter=lfs diff=lfs merge=lfs -text -*.pb filter=lfs diff=lfs merge=lfs -text -*.pickle filter=lfs diff=lfs merge=lfs -text -*.pkl filter=lfs diff=lfs merge=lfs -text -*.pt filter=lfs diff=lfs merge=lfs -text -*.pth filter=lfs diff=lfs merge=lfs -text -*.rar filter=lfs diff=lfs merge=lfs -text -*.safetensors filter=lfs diff=lfs merge=lfs -text -saved_model/**/* filter=lfs diff=lfs merge=lfs -text -*.tar.* filter=lfs diff=lfs merge=lfs -text -*.tar filter=lfs diff=lfs merge=lfs -text -*.tflite filter=lfs diff=lfs merge=lfs -text -*.tgz filter=lfs diff=lfs merge=lfs -text -*.wasm filter=lfs diff=lfs merge=lfs -text -*.xz filter=lfs diff=lfs merge=lfs -text -*.zip filter=lfs diff=lfs merge=lfs -text -*.zst filter=lfs diff=lfs merge=lfs -text -*tfevents* filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk0/graph.neff filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk0/metaneff.pb filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk0/model.MODULE_f4171003694760566af4+a9cd68fb.hlo_module.pb filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk0/model.MODULE_f4171003694760566af4+a9cd68fb.neff filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk1/graph.neff filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk1/metaneff.pb filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk1/model.MODULE_2914133a46cb7b4660ab+d7af8a84.hlo_module.pb filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk1/model.MODULE_2914133a46cb7b4660ab+d7af8a84.neff filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk2/graph.neff filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk2/metaneff.pb filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk2/model.MODULE_00594b8bc68e927f3dbe+1ad60ced.hlo_module.pb filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk2/model.MODULE_00594b8bc68e927f3dbe+1ad60ced.neff filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk3/graph.neff filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk3/metaneff.pb filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk3/model.MODULE_b3ddbc97e5f0d1d64c82+155de413.hlo_module.pb filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk3/model.MODULE_b3ddbc97e5f0d1d64c82+155de413.neff filter=lfs diff=lfs merge=lfs -text +layout_opt/graph.neff filter=lfs diff=lfs merge=lfs -text +layout_opt/model/graph.hlo filter=lfs diff=lfs merge=lfs -text +model.pt filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk0/graph.neff filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk0/metaneff.pb filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk0/model.MODULE_6ef5ba8b41fbbe77f080+74ae8282.hlo_module.pb filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk0/model.MODULE_6ef5ba8b41fbbe77f080+74ae8282.neff filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk0/wrapped_neff.hlo filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk1/graph.neff filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk1/metaneff.pb filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk1/model.MODULE_d608453625db6ed38994+e5eecdd4.hlo_module.pb filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk1/model.MODULE_d608453625db6ed38994+e5eecdd4.neff filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk2/graph.neff filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk2/metaneff.pb filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk2/model.MODULE_0ae1021f5dbf9cbac54d+2aa9c8c9.hlo_module.pb filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk2/model.MODULE_0ae1021f5dbf9cbac54d+2aa9c8c9.neff filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk3/graph.neff filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk3/metaneff.pb filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk3/model.MODULE_d3ed4857bd8baeff8023+b05cff0a.hlo_module.pb filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk3/model.MODULE_d3ed4857bd8baeff8023+b05cff0a.neff filter=lfs diff=lfs merge=lfs -text diff --git a/context_encoding_model/_tp0_bk0/command.txt b/context_encoding_model/_tp0_bk0/command.txt new file mode 100644 index 0000000000000000000000000000000000000000..c1fa26e38ebe8c4e9b4475cdeb32c08f70a82ad7 --- /dev/null +++ b/context_encoding_model/_tp0_bk0/command.txt @@ -0,0 +1 @@ +neuronx-cc compile --framework=XLA model.MODULE_f4171003694760566af4+a9cd68fb.hlo_module.pb --output model.MODULE_f4171003694760566af4+a9cd68fb.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=1 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35 \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk0/compile_flags.MODULE_f4171003694760566af4+a9cd68fb.json b/context_encoding_model/_tp0_bk0/compile_flags.MODULE_f4171003694760566af4+a9cd68fb.json new file mode 100644 index 0000000000000000000000000000000000000000..2e9e047c5e4096feba23f45fda3361e6ea3d35c8 --- /dev/null +++ b/context_encoding_model/_tp0_bk0/compile_flags.MODULE_f4171003694760566af4+a9cd68fb.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=1", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/home/ubuntu/qwen3/context_encoding_model/_tp0_bk0/log-neuron-cc.txt"] \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk0/global_metric_store.json b/context_encoding_model/_tp0_bk0/global_metric_store.json new file mode 100644 index 0000000000000000000000000000000000000000..7464480a0a933e2fa4fbf39c1ded20a33d2c1798 --- /dev/null +++ b/context_encoding_model/_tp0_bk0/global_metric_store.json @@ -0,0 +1,1079 @@ +{ + "Average": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 99.65389251708984, + "StaticProfiler::AveragePartitionUtilization": 97.55139923095703, + "StaticProfiler::AveragePeUtilization": 98.60253143310547, + "StaticProfiler::LocalizationEfficiency": 99.04553985595703, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 100.20111846923828, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0 + } + }, + "Count": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 1.0, + "StaticProfiler::AveragePartitionUtilization": 1.0, + "StaticProfiler::AveragePeUtilization": 1.0, + "StaticProfiler::LocalizationEfficiency": 1.0, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 1.0 + } + }, + "Sum": { + "compiletime": { + "AGOrderingAnalysisPass": 0.018787622451782227, + "AffinePredicateResolution": 0.0011818408966064453, + "AliasDependencyElimination": 0.00011801719665527344, + "AliasDependencyInduction": 0.005483388900756836, + "AliasDependencyReset": 0.026019811630249023, + "BFComputeCutting": 0.00225830078125, + "BirCodeGenLoop": 0.4621126651763916, + "CCOpFusion": 0.01928091049194336, + "CanonicalizeConv": 3.7000001611886546e-05, + "CanonicalizeDAGForPGTiling": 0.004612922668457031, + "CanonicalizeForTensorizer": 4.099999932805076e-05, + "CanonicalizeIR": 0.0017774105072021484, + "Canonicalizer": 0.0009619999909773469, + "CoalesceCCOp": 0.0146026611328125, + "CommuteConcat": 0.0020241737365722656, + "DMALocalityOpt": 0.005425214767456055, + "DMAProfiler": 0.012541055679321289, + "DMATilingProfiler": 0.004782676696777344, + "DataLocalityOpt": 0.06629562377929688, + "DataStreaming": 0.03773355484008789, + "DeConcat": 0.0006563663482666016, + "DeadCodeElimination": 0.002358675003051758, + "DeadStoreElimination": 0.0055620670318603516, + "DelinearIndices": 0.004741668701171875, + "Delinearization": 0.0036110877990722656, + "DoNothing": 0.00022459030151367188, + "DramToDramTranspose": 0.016016721725463867, + "DumpGraphAndMetadata": 0.0853111743927002, + "EliminateDivs": 0.0025675296783447266, + "ExpandBatchNorm": 0.002092123031616211, + "ExpandISAMacro": 0.011052370071411133, + "FactorizeBlkDims": 0.00814366340637207, + "FactorizeThreadAxesInFreeDims": 0.002122640609741211, + "FlattenMacroLoop": 0.002187013626098633, + "GenericAccessSimplifier": 0.0009529590606689453, + "HoistCompute": 6.000000212225132e-06, + "IdentifyCrossPassTensors": 7.700000423938036e-05, + "InferInitValue": 0.0242159366607666, + "InferIntrinsicOnCC": 0.009269952774047852, + "InferNeuronTensor": 0.020155906677246094, + "InferNonlocalTensors": 0.015646696090698242, + "InferPSumTensor": 0.3081786632537842, + "InlineNativeKernels": 0.009155511856079102, + "InsertIOTransposes": 0.015281438827514648, + "InsertLocalTransposes": 0.006501436233520508, + "InsertOffloadedTransposes": 0.002702474594116211, + "LICM": 0.002913951873779297, + "LateLegalizeInst": 0.014158487319946289, + "LateLegalizePostSplit": 0.012693405151367188, + "LateLowerReshapeOp": 0.0025734901428222656, + "LateLowerTensorOp": 0.001531362533569336, + "LateNeuronInstComb": 0.008838176727294922, + "LayoutPreprocessing": 0.026634931564331055, + "LayoutPreprocessingAndAnalysis": 0.5595176219940186, + "LayoutRequirementAnalysis": 0.005538463592529297, + "LegalizeCCOpLayout": 0.0022728443145751953, + "LegalizeOpLevelAlias": 0.001255035400390625, + "LegalizePartitionReduce": 0.001256704330444336, + "LegalizeSundaAccess": 0.07711672782897949, + "LegalizeSundaMacro": 0.010920286178588867, + "LegalizeType": 0.01314401626586914, + "LocalLayoutOpt": 0.012011289596557617, + "LoopFusion": 0.006572723388671875, + "LoopSplitting": 0.0003001689910888672, + "LowerBroadcast": 0.0018808841705322266, + "LowerCCOpBlockAxis": 0.0050678253173828125, + "LowerComplexBroadcast": 0.0025262832641601563, + "LowerIntrinsics": 0.3039369583129883, + "LowerTensorOp": 0.011744022369384766, + "LowerTranspose": 0.011741399765014648, + "MacroGeneration": 0.026911020278930664, + "MaskPropagation": 0.0031325817108154297, + "MemcastMotion": 2.2000000171829015e-05, + "MemcpyElimination": 0.027472257614135742, + "MutateDataType": 0.0015196800231933594, + "NeuronAliasDependencyInduction": 0.00016927719116210938, + "NeuronAliasDependencyReset": 0.0242006778717041, + "NeuronInstComb": 0.00468754768371582, + "NeuronLICM": 0.03664875030517578, + "NeuronLoopFusion": 0.00889277458190918, + "NeuronLoopInterchange": 0.002141237258911133, + "NeuronSimplifier": 0.00720524787902832, + "NeuronSimplifyPredicates": 0.12209796905517578, + "NeuronValueNumbering": 0.003449678421020508, + "OptimizeAliasedCopyChain": 0.0006387233734130859, + "OptimizeNKIKernels": 0.5260024070739746, + "PAGLayoutOpt": 0.5680239200592041, + "PComputeCutting": 0.0048143863677978516, + "PGLayoutTilingPipeline": 1.6304676532745361, + "PGTiling": 0.1616363525390625, + "PadElimination": 0.0003521442413330078, + "ParAxesAnnotation": 0.0544736385345459, + "PartialLoopFusion": 0.005907773971557617, + "PartialSimdFusion": 0.0038967132568359375, + "PenguinizeFunctions": 3.900000228895806e-05, + "PerfectLoopNest": 0.0021576881408691406, + "PruneFunctions": 3.5000000934815034e-05, + "RecognizeOpIdiom": 0.0039520263671875, + "Recompute": 0.0002884864807128906, + "RelaxPredicates": 0.013870716094970703, + "Rematerialization": 0.0024657249450683594, + "RemoveOptimizationBarriers": 6.500000017695129e-05, + "ReshapeWeights": 0.0006930828094482422, + "ResolveAccessConflict": 0.0038983821868896484, + "ResolveComplicatePredicates": 0.0012950897216796875, + "RewriteReplicationMatmul": 0.002060413360595703, + "RewriteWeights": 0.0028791427612304688, + "SFKVectorizer": 0.2904393672943115, + "ScatterMotion": 2.8000000384054147e-05, + "SimpleAllReduceTiling": 0.008909463882446289, + "Simplifier": 0.003449678421020508, + "SimplifyMacroPredicates": 0.010317325592041016, + "SimplifyNeuronTensor": 1.038323163986206, + "SimplifySlice": 0.0008852481842041016, + "SimplifyTensor": 0.005218982696533203, + "SpillPSum": 0.010073423385620117, + "SplitAPUnionSets": 0.10591006278991699, + "SplitAccGrp": 0.0011169910430908203, + "StaticProfiler": 0.01290583610534668, + "StaticTransposeLocalTensor": 0.003824472427368164, + "SundaISel": 0.041872262954711914, + "TCTransform": 0.0008666515350341797, + "TensorInitialization": 0.013058185577392578, + "TensorOpSimplifier": 0.0061550140380859375, + "TensorOpTransform": 0.020328521728515625, + "TensorizerLegalizationPass": 6.900000153109431e-05, + "TileCCOps": 0.006834983825683594, + "TilingProfiler": 0.0072863101959228516, + "TransformConvOp": 0.0032320022583007813, + "TritiumFusion": 0.03062152862548828, + "ValueNumbering": 0.0023603439331054688, + "VectorizeDMA": 0.004430294036865234, + "VectorizeMatMult": 0.0021605491638183594, + "VerifySupportedOps": 3.300000025774352e-05, + "WeightCoalescing": 0.00846409797668457, + "ZeroSizeTensorElimination": 0.00011014938354492188, + "algsimp": 0.004399999976158142, + "batchnorm_expander": 3.600000127335079e-05, + "boundary-marker-removal": 1.2000000424450263e-05, + "call-inliner": 0.0007670000777579844, + "canonicalize-boundary-marker": 1.4999999621068127e-05, + "collective-stream-id-checker": 7.300000288523734e-05, + "comparison-expander": 0.0006099999882280827, + "computation-deduplicator": 5.8999998145736754e-05, + "conditional-to-select": 1.5999999959603883e-05, + "config-lowering": 8.70000003487803e-05, + "constant-statistics": 0.0005649999948218465, + "constant_folding": 0.0005520000122487545, + "cse": 3.600000127335079e-05, + "dce": 0.00014599999121855944, + "dot_decomposer": 0.0013859999598935246, + "dynamic-slice-transpose": 1.2000000424450263e-05, + "eliminate-redundant-compare": 0.0004949999856762588, + "emit-offloaded-dropout": 3.80000019504223e-05, + "flatten-call-graph": 0.0009339999523945153, + "fuse-send-recv": 7.100000220816582e-05, + "hilo::LegalizeAlias": 1.1999999514955562e-05, + "hilo::NeuronInstCombine": 0.00010099999781232327, + "hilo::NeuronOpFusion": 4.7999998059822246e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 3.899999865097925e-05, + "hilo::ScheduleFusion": 1.9999999949504854e-06, + "hilo::SixtyFourHack": 6.200000643730164e-05, + "hilo::VerifyAliasing": 4.999999873689376e-06, + "hlo-mac-count": 0.0018479999853298068, + "hlo-verifier": 0.007563999388366938, + "instruction-histogram": 0.002553999889642, + "io-con-pipe-begin": 4.999999873689376e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.0013040000339969993, + "io-statistics": 6.500000017695129e-05, + "legalize-ccops": 3.000000106112566e-06, + "legalize-compare": 1.1000000085914508e-05, + "lower-argminmax-custom-call": 1.2000000424450263e-05, + "map-inline": 0.0008759999764151871, + "metadata-naming": 6.0999998822808266e-05, + "mlir::detail::OpToOpPassAdaptor": 7.200000254670158e-05, + "mlir::hlo::MhloToPyPenguin": 0.002776999957859516, + "mlir::mhlo::LowerComplexExtraPass": 0.00023499999952036887, + "mlir::mhlo::LowerComplexPass": 0.00032500000088475645, + "native-to-custom-softmax": 0.0007319999858736992, + "native-to-custom-softmax-dx": 0.000678999989759177, + "operand_upcaster": 4.900000203633681e-05, + "opt-barrier-removal": 0.0005629999795928597, + "post-par-pipe-begin": 9.000000318337698e-06, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.001663000090047717, + "pre-par-pipe-begin": 9.999999974752427e-07, + "pre-par-pipe-end": 9.999999974752427e-07, + "pre-partition-simplification": 0.13888800144195557, + "replace-minimum-constant": 0.0007169999880716205, + "reshape-mover": 0.00021499999274965376, + "simplify-concat": 0.00014099999680183828, + "simplify-while-loops": 0.00017800000205170363, + "transform-variadic-reduce": 6.70000008540228e-05, + "tuple-simplifier": 0.0005469999159686267, + "unpack-nested-aws-ntwsr": 0.00046300000394694507, + "unroll-while-loop": 3.099999958067201e-05, + "zero_sized_hlo_elimination": 0.0008880000095814466 + }, + "hilo": { + "ConstantSize": 304437.0, + "HloInputCount": 475.0, + "HloMacCount": 25141444608.0, + "HloOutputCount": 73.0, + "IfmapSize": 8266542080.0, + "OfmapSize": 75497472.0, + "OutputsReadFromCount": 0.0, + "PassthroughTensorsCount": 0.0, + "RedundantOutputCount": 0.0, + "Traffic": 1649111936.0 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 42834.0, + "StaticProfiler::AifUb": 129.43267822265625, + "StaticProfiler::ArithmeticIntensityTensorizer": 128.19729614257813, + "StaticProfiler::AverageDmaLength": 4810.17578125, + "StaticProfiler::DDRTransferBytes": 782946624.0, + "StaticProfiler::InternalTransferBytes": 629086720.0, + "StaticProfiler::LoadExpanded": 97814.0, + "StaticProfiler::StoreExpanded": 1757.0, + "StaticProfiler::TotalDMAExpanded": 99571.0, + "StaticProfiler::TotalDynamicInstancesCount": 50031.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 49585.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 4.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 22464.0, + "TilingProfiler::NumPfTransposes": 5.0, + "TilingProfiler::NumPfTransposesForIo": 1.0, + "TilingProfiler::NumPfTransposesForLocal": 1.0, + "TilingProfiler::NumPfTransposesForNonlocal": 3.0, + "TilingProfiler::PfTransposeInstructions": 19105.0, + "TilingProfiler::PfTransposeInstructionsForIo": 19008.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 1.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 96.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 4.0, + "TilingProfiler::SimdInstructionsAfterTiling": 158.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "all": { + "compiletime": { + "algsimp": 0.004207999911159277, + "call-inliner": 0.0007350000087171793, + "collective-stream-id-checker": 6.399999983841553e-05, + "comparison-expander": 0.0005949999904260039, + "constant-statistics": 0.0005649999948218465, + "constant_folding": 0.0005249999812804163, + "dce": 0.0001429999974789098, + "dot_decomposer": 0.0013859999598935246, + "eliminate-redundant-compare": 0.0004839999892283231, + "flatten-call-graph": 0.000901999999769032, + "hlo-mac-count": 0.0016270000487565994, + "hlo-verifier": 0.007029999978840351, + "instruction-histogram": 0.002553999889642, + "io-con-pipe-begin": 4.999999873689376e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.0013040000339969993, + "io-statistics": 6.500000017695129e-05, + "map-inline": 0.000838999985717237, + "native-to-custom-softmax": 0.0007050000131130219, + "native-to-custom-softmax-dx": 0.0005089999758638442, + "opt-barrier-removal": 0.0005629999795928597, + "pre-par-pipe-begin": 9.999999974752427e-07, + "pre-par-pipe-end": 9.999999974752427e-07, + "pre-partition-simplification": 0.13888800144195557, + "replace-minimum-constant": 0.0006949999951757491, + "reshape-mover": 0.00020500000391621143, + "simplify-while-loops": 0.0001720000000204891, + "tuple-simplifier": 0.0005319999763742089, + "unpack-nested-aws-ntwsr": 0.000450999999884516, + "unroll-while-loop": 2.9000000722589903e-05, + "zero_sized_hlo_elimination": 0.0008880000095814466 + } + }, + "cumsum": { + "compiletime": { + "CoalesceCCOp": 0.00020933151245117188, + "DMALocalityOpt": 0.0001666545867919922, + "DMAProfiler": 0.0008401870727539063, + "DataStreaming": 0.0002658367156982422, + "DoNothing": 0.00014090538024902344, + "ExpandISAMacro": 0.0004999637603759766, + "FactorizeBlkDims": 0.00046062469482421875, + "InferPSumTensor": 0.0004820823669433594, + "LateLegalizeInst": 0.0004343986511230469, + "LateNeuronInstComb": 0.0004832744598388672, + "LegalizeSundaAccess": 0.002238750457763672, + "LegalizeType": 0.0002429485321044922, + "LowerBroadcast": 0.0002453327178955078, + "LowerIntrinsics": 0.00021791458129882813, + "LowerTranspose": 0.00022292137145996094, + "NeuronInstComb": 0.0005400180816650391, + "NeuronLICM": 0.0003840923309326172, + "NeuronSimplifyPredicates": 0.0028014183044433594, + "NeuronValueNumbering": 0.00042724609375, + "SFKVectorizer": 0.0028204917907714844, + "SimpleAllReduceTiling": 0.0002048015594482422, + "SimplifyNeuronTensor": 0.00043082237243652344, + "SpillPSum": 0.0005221366882324219, + "WeightCoalescing": 0.00020456314086914063 + } + }, + "sg00": { + "compiletime": { + "CanonicalizeConv": 1.4000000192027073e-05, + "CanonicalizeForTensorizer": 1.4999999621068127e-05, + "Canonicalizer": 0.0003440000000409782, + "HoistCompute": 3.000000106112566e-06, + "IdentifyCrossPassTensors": 3.099999958067201e-05, + "MemcastMotion": 1.2000000424450263e-05, + "PenguinizeFunctions": 1.5999999959603883e-05, + "PruneFunctions": 1.2999999853491317e-05, + "RemoveOptimizationBarriers": 2.4000000848900527e-05, + "ScatterMotion": 1.1000000085914508e-05, + "TensorizerLegalizationPass": 2.9000000722589903e-05, + "VerifySupportedOps": 1.1000000085914508e-05, + "algsimp": 6.70000008540228e-05, + "batchnorm_expander": 1.2999999853491317e-05, + "boundary-marker-removal": 3.999999989900971e-06, + "call-inliner": 1.1000000085914508e-05, + "canonicalize-boundary-marker": 6.000000212225132e-06, + "collective-stream-id-checker": 3.000000106112566e-06, + "comparison-expander": 4.999999873689376e-06, + "computation-deduplicator": 1.700000029813964e-05, + "conditional-to-select": 4.999999873689376e-06, + "config-lowering": 3.199999991920777e-05, + "constant_folding": 9.000000318337698e-06, + "cse": 1.2999999853491317e-05, + "dce": 9.999999974752427e-07, + "dynamic-slice-transpose": 3.999999989900971e-06, + "eliminate-redundant-compare": 3.999999989900971e-06, + "emit-offloaded-dropout": 1.2999999853491317e-05, + "flatten-call-graph": 1.1000000085914508e-05, + "fuse-send-recv": 2.4000000848900527e-05, + "hilo::LegalizeAlias": 4.999999873689376e-06, + "hilo::NeuronInstCombine": 6.0999998822808266e-05, + "hilo::NeuronOpFusion": 7.000000096013537e-06, + "hilo::ReplaceTokenTypeWithU8Pass": 1.5999999959603883e-05, + "hilo::ScheduleFusion": 9.999999974752427e-07, + "hilo::SixtyFourHack": 1.4999999621068127e-05, + "hilo::VerifyAliasing": 1.9999999949504854e-06, + "hlo-mac-count": 2.5999999706982635e-05, + "hlo-verifier": 0.00018699999782256782, + "legalize-ccops": 9.999999974752427e-07, + "legalize-compare": 3.999999989900971e-06, + "lower-argminmax-custom-call": 3.999999989900971e-06, + "map-inline": 1.2000000424450263e-05, + "metadata-naming": 2.099999983329326e-05, + "mlir::detail::OpToOpPassAdaptor": 2.300000051036477e-05, + "mlir::hlo::MhloToPyPenguin": 0.0010349999647587538, + "mlir::mhlo::LowerComplexExtraPass": 8.600000001024455e-05, + "mlir::mhlo::LowerComplexPass": 0.0001740000006975606, + "native-to-custom-softmax": 1.4000000192027073e-05, + "native-to-custom-softmax-dx": 0.0001340000017080456, + "operand_upcaster": 1.8000000636675395e-05, + "post-par-pipe-begin": 6.000000212225132e-06, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0005830000154674053, + "replace-minimum-constant": 7.000000096013537e-06, + "reshape-mover": 3.999999989900971e-06, + "simplify-concat": 4.8000001697801054e-05, + "simplify-while-loops": 1.9999999949504854e-06, + "transform-variadic-reduce": 9.000000318337698e-06, + "tuple-simplifier": 4.999999873689376e-06, + "unpack-nested-aws-ntwsr": 3.999999989900971e-06, + "unroll-while-loop": 0.0 + }, + "hilo": { + "ArithmeticIntensity": 8.198826789855957, + "ConstantSize": 304437.0, + "HloInputCount": 475.0, + "HloMacCount": 2751463424.0, + "HloOutputCount": 73.0, + "IfmapSize": 8266542080.0, + "OfmapSize": 75497472.0, + "OutputsReadFromCount": 0.0, + "PassthroughTensorsCount": 0.0, + "RedundantOutputCount": 0.0, + "Traffic": 671184704.0 + } + }, + "sg0000": { + "compiletime": { + "AGOrderingAnalysisPass": 0.04074835777282715, + "AffinePredicateResolution": 0.002183198928833008, + "AliasDependencyElimination": 0.00012922286987304688, + "AliasDependencyInduction": 0.008634567260742188, + "AliasDependencyReset": 0.03679013252258301, + "BFComputeCutting": 0.0019538402557373047, + "BirCodeGenLoop": 0.04571366310119629, + "CCOpFusion": 0.01575756072998047, + "CanonicalizeDAGForPGTiling": 0.003149271011352539, + "CanonicalizeIR": 0.002719879150390625, + "CoalesceCCOp": 0.0047032833099365234, + "CommuteConcat": 0.0013585090637207031, + "DMALocalityOpt": 0.001116037368774414, + "DMAProfiler": 0.0047032833099365234, + "DMATilingProfiler": 0.004144191741943359, + "DataLocalityOpt": 0.10100674629211426, + "DataStreaming": 0.0033788681030273438, + "DeConcat": 0.0007069110870361328, + "DeadCodeElimination": 0.0010058879852294922, + "DeadStoreElimination": 0.031080961227416992, + "DelinearIndices": 0.007829427719116211, + "Delinearization": 0.003365039825439453, + "DoNothing": 7.033348083496094e-05, + "DramToDramTranspose": 0.024500370025634766, + "DumpGraphAndMetadata": 0.005262136459350586, + "EliminateDivs": 0.005412578582763672, + "ExpandBatchNorm": 0.0019643306732177734, + "ExpandISAMacro": 0.002582550048828125, + "FactorizeBlkDims": 0.00794839859008789, + "FactorizeThreadAxesInFreeDims": 0.0020449161529541016, + "FlattenMacroLoop": 0.0028934478759765625, + "GenericAccessSimplifier": 0.0009298324584960938, + "InferInitValue": 0.026146411895751953, + "InferIntrinsicOnCC": 0.010050058364868164, + "InferNeuronTensor": 0.03407764434814453, + "InferNonlocalTensors": 0.06189298629760742, + "InferPSumTensor": 0.03060150146484375, + "InlineNativeKernels": 0.0014431476593017578, + "InsertIOTransposes": 0.009805679321289063, + "InsertLocalTransposes": 0.007609128952026367, + "InsertOffloadedTransposes": 0.004189968109130859, + "LICM": 0.0029850006103515625, + "LateLegalizeInst": 0.004921674728393555, + "LateLegalizePostSplit": 0.0025641918182373047, + "LateLowerReshapeOp": 0.002185821533203125, + "LateLowerTensorOp": 0.00531768798828125, + "LateNeuronInstComb": 0.02812671661376953, + "LayoutPreprocessing": 0.11982965469360352, + "LayoutPreprocessingAndAnalysis": 0.24928760528564453, + "LayoutRequirementAnalysis": 0.007187366485595703, + "LegalizeCCOpLayout": 0.0035941600799560547, + "LegalizeOpLevelAlias": 0.0022826194763183594, + "LegalizePartitionReduce": 0.002084970474243164, + "LegalizeSundaAccess": 0.03499269485473633, + "LegalizeSundaMacro": 0.00858449935913086, + "LegalizeType": 0.0038924217224121094, + "LocalLayoutOpt": 0.015146255493164063, + "LoopFusion": 0.00600433349609375, + "LoopSplitting": 0.0003192424774169922, + "LowerBroadcast": 0.0030934810638427734, + "LowerCCOpBlockAxis": 0.0053822994232177734, + "LowerComplexBroadcast": 0.0017805099487304688, + "LowerIntrinsics": 0.03145861625671387, + "LowerTensorOp": 0.013553142547607422, + "LowerTranspose": 0.008147954940795898, + "MacroGeneration": 0.10158348083496094, + "MaskPropagation": 0.004988193511962891, + "MemcpyElimination": 0.1091456413269043, + "MutateDataType": 0.002095937728881836, + "NeuronAliasDependencyInduction": 0.00023055076599121094, + "NeuronAliasDependencyReset": 0.036977291107177734, + "NeuronInstComb": 0.01214146614074707, + "NeuronLICM": 0.007807016372680664, + "NeuronLoopFusion": 0.014447927474975586, + "NeuronLoopInterchange": 0.0015079975128173828, + "NeuronSimplifier": 0.009031057357788086, + "NeuronSimplifyPredicates": 0.0026018619537353516, + "NeuronValueNumbering": 0.00443577766418457, + "OptimizeAliasedCopyChain": 0.0012700557708740234, + "OptimizeNKIKernels": 0.00177764892578125, + "PAGLayoutOpt": 0.3914484977722168, + "PComputeCutting": 0.005900144577026367, + "PGLayoutTilingPipeline": 1.2139532566070557, + "PGTiling": 0.2603449821472168, + "PadElimination": 0.00040340423583984375, + "ParAxesAnnotation": 0.2578258514404297, + "PartialLoopFusion": 0.010677099227905273, + "PartialSimdFusion": 0.011437177658081055, + "PerfectLoopNest": 0.001963376998901367, + "RecognizeOpIdiom": 0.004378318786621094, + "Recompute": 0.0002574920654296875, + "RelaxPredicates": 0.003600597381591797, + "Rematerialization": 0.004474163055419922, + "ReshapeWeights": 0.0006759166717529297, + "ResolveAccessConflict": 0.003798246383666992, + "ResolveComplicatePredicates": 0.002101421356201172, + "RewriteReplicationMatmul": 0.0012481212615966797, + "RewriteWeights": 0.004036903381347656, + "SFKVectorizer": 0.09602093696594238, + "SimpleAllReduceTiling": 0.0017740726470947266, + "Simplifier": 0.004450559616088867, + "SimplifyMacroPredicates": 0.010053157806396484, + "SimplifyNeuronTensor": 0.00724029541015625, + "SimplifySlice": 0.001153707504272461, + "SimplifyTensor": 0.005860805511474609, + "SpillPSum": 0.011501789093017578, + "SplitAPUnionSets": 0.03104996681213379, + "SplitAccGrp": 0.002181529998779297, + "StaticProfiler": 0.004481792449951172, + "StaticTransposeLocalTensor": 0.006117343902587891, + "SundaISel": 0.041422128677368164, + "TCTransform": 0.0022428035736083984, + "TensorInitialization": 0.00680994987487793, + "TensorOpSimplifier": 0.008346796035766602, + "TensorOpTransform": 0.030104398727416992, + "TileCCOps": 0.005553245544433594, + "TilingProfiler": 0.009899139404296875, + "TransformConvOp": 0.0027108192443847656, + "TritiumFusion": 0.020798206329345703, + "ValueNumbering": 0.003211498260498047, + "VectorizeDMA": 0.004341602325439453, + "VectorizeMatMult": 0.0021800994873046875, + "WeightCoalescing": 0.0030617713928222656, + "ZeroSizeTensorElimination": 0.00011968612670898438 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 1396.0, + "StaticProfiler::AifUb": 8.992382049560547, + "StaticProfiler::ArithmeticIntensityTensorizer": 75.54261016845703, + "StaticProfiler::AverageDmaLength": 9594.294921875, + "StaticProfiler::AverageFractalPeUtilization": 99.893310546875, + "StaticProfiler::AveragePartitionUtilization": 94.61784362792969, + "StaticProfiler::AveragePeUtilization": 99.893310546875, + "StaticProfiler::DDRTransferBytes": 79837440.0, + "StaticProfiler::InternalTransferBytes": 9797632.0, + "StaticProfiler::LoadExpanded": 11010.0, + "StaticProfiler::LocalizationEfficiency": 840.0734252929688, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1527.2347412109375, + "StaticProfiler::StoreExpanded": 3073.0, + "StaticProfiler::TotalDMAExpanded": 14083.0, + "StaticProfiler::TotalDynamicInstancesCount": 1442.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 1442.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 12.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 821.0, + "TilingProfiler::NumPfTransposes": 6.0, + "TilingProfiler::NumPfTransposesForIo": 0.0, + "TilingProfiler::NumPfTransposesForLocal": 5.0, + "TilingProfiler::NumPfTransposesForNonlocal": 1.0, + "TilingProfiler::PfTransposeInstructions": 72.0, + "TilingProfiler::PfTransposeInstructionsForIo": 0.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 56.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 16.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 0.0, + "TilingProfiler::SimdInstructionsAfterTiling": 101.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg0001": { + "compiletime": { + "AGOrderingAnalysisPass": 0.034732818603515625, + "AffinePredicateResolution": 0.0015087127685546875, + "AliasDependencyElimination": 0.0001227855682373047, + "AliasDependencyInduction": 0.008313894271850586, + "AliasDependencyReset": 0.044220924377441406, + "BFComputeCutting": 0.001974344253540039, + "BirCodeGenLoop": 0.03118896484375, + "CCOpFusion": 0.018246889114379883, + "CanonicalizeDAGForPGTiling": 0.003057718276977539, + "CanonicalizeIR": 0.0027036666870117188, + "CoalesceCCOp": 0.0046405792236328125, + "CommuteConcat": 0.0015790462493896484, + "DMALocalityOpt": 0.0015497207641601563, + "DMAProfiler": 0.004349708557128906, + "DMATilingProfiler": 0.003928422927856445, + "DataLocalityOpt": 0.12123703956604004, + "DataStreaming": 0.0025773048400878906, + "DeConcat": 0.0008485317230224609, + "DeadCodeElimination": 0.0012981891632080078, + "DeadStoreElimination": 0.034687042236328125, + "DelinearIndices": 0.009628772735595703, + "Delinearization": 0.003772258758544922, + "DoNothing": 7.009506225585938e-05, + "DramToDramTranspose": 0.028621673583984375, + "DumpGraphAndMetadata": 0.003651142120361328, + "EliminateDivs": 0.004262447357177734, + "ExpandBatchNorm": 0.002134084701538086, + "ExpandISAMacro": 0.0026290416717529297, + "FactorizeBlkDims": 0.009716033935546875, + "FactorizeThreadAxesInFreeDims": 0.0013210773468017578, + "FlattenMacroLoop": 0.002851247787475586, + "GenericAccessSimplifier": 0.002216815948486328, + "InferInitValue": 0.03134632110595703, + "InferIntrinsicOnCC": 0.011671781539916992, + "InferNeuronTensor": 0.039717674255371094, + "InferNonlocalTensors": 0.030872583389282227, + "InferPSumTensor": 0.022834062576293945, + "InlineNativeKernels": 0.0021605491638183594, + "InsertIOTransposes": 0.017906904220581055, + "InsertLocalTransposes": 0.007941961288452148, + "InsertOffloadedTransposes": 0.0032515525817871094, + "LICM": 0.003479480743408203, + "LateLegalizeInst": 0.003596782684326172, + "LateLegalizePostSplit": 0.002257108688354492, + "LateLowerReshapeOp": 0.0018393993377685547, + "LateLowerTensorOp": 0.005475044250488281, + "LateNeuronInstComb": 0.017774581909179688, + "LayoutPreprocessing": 0.03530263900756836, + "LayoutPreprocessingAndAnalysis": 0.11916303634643555, + "LayoutRequirementAnalysis": 0.007796525955200195, + "LegalizeCCOpLayout": 0.0019328594207763672, + "LegalizeOpLevelAlias": 0.001219034194946289, + "LegalizePartitionReduce": 0.0009839534759521484, + "LegalizeSundaAccess": 0.015137434005737305, + "LegalizeSundaMacro": 0.010521173477172852, + "LegalizeType": 0.004090547561645508, + "LocalLayoutOpt": 0.020325422286987305, + "LoopFusion": 0.006730556488037109, + "LoopSplitting": 0.00034809112548828125, + "LowerBroadcast": 0.001789093017578125, + "LowerCCOpBlockAxis": 0.005074977874755859, + "LowerComplexBroadcast": 0.0019309520721435547, + "LowerIntrinsics": 0.03209352493286133, + "LowerTensorOp": 0.012279510498046875, + "LowerTranspose": 0.010157585144042969, + "MacroGeneration": 0.09246373176574707, + "MaskPropagation": 0.003335237503051758, + "MemcpyElimination": 0.10414385795593262, + "MutateDataType": 0.00220489501953125, + "NeuronAliasDependencyInduction": 0.0002532005310058594, + "NeuronAliasDependencyReset": 0.03873252868652344, + "NeuronInstComb": 0.012767791748046875, + "NeuronLICM": 0.006428241729736328, + "NeuronLoopFusion": 0.01547694206237793, + "NeuronLoopInterchange": 0.0012590885162353516, + "NeuronSimplifier": 0.009620428085327148, + "NeuronSimplifyPredicates": 0.0022652149200439453, + "NeuronValueNumbering": 0.0031261444091796875, + "OptimizeAliasedCopyChain": 0.0007045269012451172, + "OptimizeNKIKernels": 0.0022683143615722656, + "PAGLayoutOpt": 0.11684298515319824, + "PComputeCutting": 0.0060575008392333984, + "PGLayoutTilingPipeline": 1.5194215774536133, + "PGTiling": 0.5792257785797119, + "PadElimination": 0.0004138946533203125, + "ParAxesAnnotation": 0.08577656745910645, + "PartialLoopFusion": 0.010853052139282227, + "PartialSimdFusion": 0.010831356048583984, + "PerfectLoopNest": 0.0021359920501708984, + "RecognizeOpIdiom": 0.004781246185302734, + "Recompute": 0.00029349327087402344, + "RelaxPredicates": 0.0031125545501708984, + "Rematerialization": 0.002535104751586914, + "ReshapeWeights": 0.0007915496826171875, + "ResolveAccessConflict": 0.004204988479614258, + "ResolveComplicatePredicates": 0.0014605522155761719, + "RewriteReplicationMatmul": 0.0014035701751708984, + "RewriteWeights": 0.0033304691314697266, + "SFKVectorizer": 0.11060166358947754, + "SimpleAllReduceTiling": 0.0013706684112548828, + "Simplifier": 0.004431247711181641, + "SimplifyMacroPredicates": 0.005709409713745117, + "SimplifyNeuronTensor": 0.005321979522705078, + "SimplifySlice": 0.0020780563354492188, + "SimplifyTensor": 0.00576329231262207, + "SpillPSum": 0.01259160041809082, + "SplitAPUnionSets": 0.009907007217407227, + "SplitAccGrp": 0.0010552406311035156, + "StaticProfiler": 0.0033452510833740234, + "StaticTransposeLocalTensor": 0.005699634552001953, + "SundaISel": 0.04179859161376953, + "TCTransform": 0.0024602413177490234, + "TensorInitialization": 0.0022628307342529297, + "TensorOpSimplifier": 0.006663322448730469, + "TensorOpTransform": 0.03399252891540527, + "TileCCOps": 0.0057027339935302734, + "TilingProfiler": 0.01235508918762207, + "TransformConvOp": 0.0025110244750976563, + "TritiumFusion": 0.04343080520629883, + "ValueNumbering": 0.0041046142578125, + "VectorizeDMA": 0.0033228397369384766, + "VectorizeMatMult": 0.0033872127532958984, + "WeightCoalescing": 0.0023338794708251953, + "ZeroSizeTensorElimination": 0.00011372566223144531 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 5268.0, + "StaticProfiler::AifUb": 127.58392333984375, + "StaticProfiler::ArithmeticIntensityTensorizer": 129.38287353515625, + "StaticProfiler::AverageDmaLength": 6718.79638671875, + "StaticProfiler::AverageFractalPeUtilization": 100.0, + "StaticProfiler::AveragePartitionUtilization": 99.92172241210938, + "StaticProfiler::AveragePeUtilization": 100.0, + "StaticProfiler::DDRTransferBytes": 198661120.0, + "StaticProfiler::InternalTransferBytes": 10321920.0, + "StaticProfiler::LoadExpanded": 27264.0, + "StaticProfiler::LocalizationEfficiency": 101.41001892089844, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 103.59725189208984, + "StaticProfiler::StoreExpanded": 1153.0, + "StaticProfiler::TotalDMAExpanded": 28417.0, + "StaticProfiler::TotalDynamicInstancesCount": 5111.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 5111.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 8.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 4276.0, + "TilingProfiler::NumPfTransposes": 8.0, + "TilingProfiler::NumPfTransposesForIo": 3.0, + "TilingProfiler::NumPfTransposesForLocal": 3.0, + "TilingProfiler::NumPfTransposesForNonlocal": 2.0, + "TilingProfiler::PfTransposeInstructions": 122.0, + "TilingProfiler::PfTransposeInstructionsForIo": 34.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 24.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 64.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 0.0, + "TilingProfiler::SimdInstructionsAfterTiling": 180.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg0002": { + "compiletime": { + "AGOrderingAnalysisPass": 0.018787622451782227, + "AffinePredicateResolution": 0.0011818408966064453, + "AliasDependencyElimination": 0.00011801719665527344, + "AliasDependencyInduction": 0.005483388900756836, + "AliasDependencyReset": 0.026019811630249023, + "BFComputeCutting": 0.00225830078125, + "BirCodeGenLoop": 0.4621126651763916, + "CCOpFusion": 0.01928091049194336, + "CanonicalizeDAGForPGTiling": 0.004612922668457031, + "CanonicalizeIR": 0.0017774105072021484, + "CoalesceCCOp": 0.014393329620361328, + "CommuteConcat": 0.0020241737365722656, + "DMALocalityOpt": 0.0052585601806640625, + "DMAProfiler": 0.011700868606567383, + "DMATilingProfiler": 0.004782676696777344, + "DataLocalityOpt": 0.06629562377929688, + "DataStreaming": 0.03746771812438965, + "DeConcat": 0.0006563663482666016, + "DeadCodeElimination": 0.002358675003051758, + "DeadStoreElimination": 0.0055620670318603516, + "DelinearIndices": 0.004741668701171875, + "Delinearization": 0.0036110877990722656, + "DoNothing": 8.368492126464844e-05, + "DramToDramTranspose": 0.016016721725463867, + "DumpGraphAndMetadata": 0.0853111743927002, + "EliminateDivs": 0.0025675296783447266, + "ExpandBatchNorm": 0.002092123031616211, + "ExpandISAMacro": 0.010552406311035156, + "FactorizeBlkDims": 0.0076830387115478516, + "FactorizeThreadAxesInFreeDims": 0.002122640609741211, + "FlattenMacroLoop": 0.002187013626098633, + "GenericAccessSimplifier": 0.0009529590606689453, + "InferInitValue": 0.0242159366607666, + "InferIntrinsicOnCC": 0.009269952774047852, + "InferNeuronTensor": 0.020155906677246094, + "InferNonlocalTensors": 0.015646696090698242, + "InferPSumTensor": 0.3076965808868408, + "InlineNativeKernels": 0.009155511856079102, + "InsertIOTransposes": 0.015281438827514648, + "InsertLocalTransposes": 0.006501436233520508, + "InsertOffloadedTransposes": 0.002702474594116211, + "LICM": 0.002913951873779297, + "LateLegalizeInst": 0.013724088668823242, + "LateLegalizePostSplit": 0.012693405151367188, + "LateLowerReshapeOp": 0.0025734901428222656, + "LateLowerTensorOp": 0.001531362533569336, + "LateNeuronInstComb": 0.008354902267456055, + "LayoutPreprocessing": 0.026634931564331055, + "LayoutPreprocessingAndAnalysis": 0.5595176219940186, + "LayoutRequirementAnalysis": 0.005538463592529297, + "LegalizeCCOpLayout": 0.0022728443145751953, + "LegalizeOpLevelAlias": 0.001255035400390625, + "LegalizePartitionReduce": 0.001256704330444336, + "LegalizeSundaAccess": 0.07487797737121582, + "LegalizeSundaMacro": 0.010920286178588867, + "LegalizeType": 0.012901067733764648, + "LocalLayoutOpt": 0.012011289596557617, + "LoopFusion": 0.006572723388671875, + "LoopSplitting": 0.0003001689910888672, + "LowerBroadcast": 0.0016355514526367188, + "LowerCCOpBlockAxis": 0.0050678253173828125, + "LowerComplexBroadcast": 0.0025262832641601563, + "LowerIntrinsics": 0.30371904373168945, + "LowerTensorOp": 0.011744022369384766, + "LowerTranspose": 0.011518478393554688, + "MacroGeneration": 0.026911020278930664, + "MaskPropagation": 0.0031325817108154297, + "MemcpyElimination": 0.027472257614135742, + "MutateDataType": 0.0015196800231933594, + "NeuronAliasDependencyInduction": 0.00016927719116210938, + "NeuronAliasDependencyReset": 0.0242006778717041, + "NeuronInstComb": 0.004147529602050781, + "NeuronLICM": 0.036264657974243164, + "NeuronLoopFusion": 0.00889277458190918, + "NeuronLoopInterchange": 0.002141237258911133, + "NeuronSimplifier": 0.00720524787902832, + "NeuronSimplifyPredicates": 0.11929655075073242, + "NeuronValueNumbering": 0.003022432327270508, + "OptimizeAliasedCopyChain": 0.0006387233734130859, + "OptimizeNKIKernels": 0.5260024070739746, + "PAGLayoutOpt": 0.5680239200592041, + "PComputeCutting": 0.0048143863677978516, + "PGLayoutTilingPipeline": 1.6304676532745361, + "PGTiling": 0.1616363525390625, + "PadElimination": 0.0003521442413330078, + "ParAxesAnnotation": 0.0544736385345459, + "PartialLoopFusion": 0.005907773971557617, + "PartialSimdFusion": 0.0038967132568359375, + "PerfectLoopNest": 0.0021576881408691406, + "RecognizeOpIdiom": 0.0039520263671875, + "Recompute": 0.0002884864807128906, + "RelaxPredicates": 0.013870716094970703, + "Rematerialization": 0.0024657249450683594, + "ReshapeWeights": 0.0006930828094482422, + "ResolveAccessConflict": 0.0038983821868896484, + "ResolveComplicatePredicates": 0.0012950897216796875, + "RewriteReplicationMatmul": 0.002060413360595703, + "RewriteWeights": 0.0028791427612304688, + "SFKVectorizer": 0.28761887550354004, + "SimpleAllReduceTiling": 0.008704662322998047, + "Simplifier": 0.003449678421020508, + "SimplifyMacroPredicates": 0.010317325592041016, + "SimplifyNeuronTensor": 1.0378923416137695, + "SimplifySlice": 0.0008852481842041016, + "SimplifyTensor": 0.005218982696533203, + "SpillPSum": 0.009551286697387695, + "SplitAPUnionSets": 0.10591006278991699, + "SplitAccGrp": 0.0011169910430908203, + "StaticProfiler": 0.01290583610534668, + "StaticTransposeLocalTensor": 0.003824472427368164, + "SundaISel": 0.041872262954711914, + "TCTransform": 0.0008666515350341797, + "TensorInitialization": 0.013058185577392578, + "TensorOpSimplifier": 0.0061550140380859375, + "TensorOpTransform": 0.020328521728515625, + "TileCCOps": 0.006834983825683594, + "TilingProfiler": 0.0072863101959228516, + "TransformConvOp": 0.0032320022583007813, + "TritiumFusion": 0.03062152862548828, + "ValueNumbering": 0.0023603439331054688, + "VectorizeDMA": 0.004430294036865234, + "VectorizeMatMult": 0.0021605491638183594, + "WeightCoalescing": 0.00825953483581543, + "ZeroSizeTensorElimination": 0.00011014938354492188 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 42834.0, + "StaticProfiler::AifUb": 129.43267822265625, + "StaticProfiler::ArithmeticIntensityTensorizer": 128.19729614257813, + "StaticProfiler::AverageDmaLength": 4810.17578125, + "StaticProfiler::AverageFractalPeUtilization": 99.65389251708984, + "StaticProfiler::AveragePartitionUtilization": 97.55139923095703, + "StaticProfiler::AveragePeUtilization": 98.60253143310547, + "StaticProfiler::DDRTransferBytes": 782946624.0, + "StaticProfiler::InternalTransferBytes": 629086720.0, + "StaticProfiler::LoadExpanded": 97814.0, + "StaticProfiler::LocalizationEfficiency": 99.04553985595703, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 100.20111846923828, + "StaticProfiler::StoreExpanded": 1757.0, + "StaticProfiler::TotalDMAExpanded": 99571.0, + "StaticProfiler::TotalDynamicInstancesCount": 50031.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 49585.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 4.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 22464.0, + "TilingProfiler::NumPfTransposes": 5.0, + "TilingProfiler::NumPfTransposesForIo": 1.0, + "TilingProfiler::NumPfTransposesForLocal": 1.0, + "TilingProfiler::NumPfTransposesForNonlocal": 3.0, + "TilingProfiler::PfTransposeInstructions": 19105.0, + "TilingProfiler::PfTransposeInstructionsForIo": 19008.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 1.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 96.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 4.0, + "TilingProfiler::SimdInstructionsAfterTiling": 158.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg01": { + "compiletime": { + "CanonicalizeConv": 2.300000051036477e-05, + "CanonicalizeForTensorizer": 1.2999999853491317e-05, + "Canonicalizer": 0.00028199999360367656, + "HoistCompute": 3.000000106112566e-06, + "IdentifyCrossPassTensors": 2.4000000848900527e-05, + "MemcastMotion": 9.999999747378752e-06, + "PenguinizeFunctions": 1.4000000192027073e-05, + "PruneFunctions": 1.2000000424450263e-05, + "RemoveOptimizationBarriers": 2.2000000171829015e-05, + "ScatterMotion": 1.700000029813964e-05, + "TensorizerLegalizationPass": 3.400000059627928e-05, + "VerifySupportedOps": 1.1000000085914508e-05, + "algsimp": 6.500000017695129e-05, + "batchnorm_expander": 1.2000000424450263e-05, + "boundary-marker-removal": 3.999999989900971e-06, + "call-inliner": 9.999999747378752e-06, + "canonicalize-boundary-marker": 4.999999873689376e-06, + "collective-stream-id-checker": 3.999999989900971e-06, + "comparison-expander": 4.999999873689376e-06, + "computation-deduplicator": 2.099999983329326e-05, + "conditional-to-select": 4.999999873689376e-06, + "config-lowering": 2.9000000722589903e-05, + "constant_folding": 9.000000318337698e-06, + "cse": 1.2000000424450263e-05, + "dce": 9.999999974752427e-07, + "dynamic-slice-transpose": 3.999999989900971e-06, + "eliminate-redundant-compare": 3.999999989900971e-06, + "emit-offloaded-dropout": 1.2999999853491317e-05, + "flatten-call-graph": 9.000000318337698e-06, + "fuse-send-recv": 2.9000000722589903e-05, + "hilo::LegalizeAlias": 4.999999873689376e-06, + "hilo::NeuronInstCombine": 3.5000000934815034e-05, + "hilo::NeuronOpFusion": 1.2999999853491317e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 1.5999999959603883e-05, + "hilo::ScheduleFusion": 9.999999974752427e-07, + "hilo::SixtyFourHack": 9.999999747378752e-06, + "hilo::VerifyAliasing": 1.9999999949504854e-06, + "hlo-mac-count": 2.5999999706982635e-05, + "hlo-verifier": 0.00018899999849963933, + "legalize-ccops": 9.999999974752427e-07, + "legalize-compare": 3.999999989900971e-06, + "lower-argminmax-custom-call": 3.999999989900971e-06, + "map-inline": 1.2000000424450263e-05, + "metadata-naming": 2.4000000848900527e-05, + "mlir::detail::OpToOpPassAdaptor": 2.2000000171829015e-05, + "mlir::hlo::MhloToPyPenguin": 0.0008980000275187194, + "mlir::mhlo::LowerComplexExtraPass": 7.999999797903001e-05, + "mlir::mhlo::LowerComplexPass": 0.00013499999477062374, + "native-to-custom-softmax": 7.000000096013537e-06, + "native-to-custom-softmax-dx": 1.5999999959603883e-05, + "operand_upcaster": 1.8999999156221747e-05, + "post-par-pipe-begin": 1.9999999949504854e-06, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0005530000198632479, + "replace-minimum-constant": 6.000000212225132e-06, + "reshape-mover": 3.000000106112566e-06, + "simplify-concat": 4.8999998398358e-05, + "simplify-while-loops": 1.9999999949504854e-06, + "transform-variadic-reduce": 9.000000318337698e-06, + "tuple-simplifier": 4.999999873689376e-06, + "unpack-nested-aws-ntwsr": 3.999999989900971e-06, + "unroll-while-loop": 9.999999974752427e-07 + }, + "hilo": { + "ArithmeticIntensity": 123.27030181884766, + "HloMacCount": 12415139840.0, + "Traffic": 201429536.0 + } + }, + "sg02": { + "compiletime": { + "CanonicalizeConv": 0.0, + "CanonicalizeForTensorizer": 1.2999999853491317e-05, + "Canonicalizer": 0.00033599999733269215, + "HoistCompute": 0.0, + "IdentifyCrossPassTensors": 2.2000000171829015e-05, + "MemcastMotion": 0.0, + "PenguinizeFunctions": 9.000000318337698e-06, + "PruneFunctions": 9.999999747378752e-06, + "RemoveOptimizationBarriers": 1.8999999156221747e-05, + "ScatterMotion": 0.0, + "TensorizerLegalizationPass": 6.000000212225132e-06, + "VerifySupportedOps": 1.1000000085914508e-05, + "algsimp": 5.999999848427251e-05, + "batchnorm_expander": 1.1000000085914508e-05, + "boundary-marker-removal": 3.999999989900971e-06, + "call-inliner": 1.1000000085914508e-05, + "canonicalize-boundary-marker": 3.999999989900971e-06, + "collective-stream-id-checker": 1.9999999949504854e-06, + "comparison-expander": 4.999999873689376e-06, + "computation-deduplicator": 2.099999983329326e-05, + "conditional-to-select": 6.000000212225132e-06, + "config-lowering": 2.5999999706982635e-05, + "constant_folding": 9.000000318337698e-06, + "cse": 1.1000000085914508e-05, + "dce": 9.999999974752427e-07, + "dynamic-slice-transpose": 3.999999989900971e-06, + "eliminate-redundant-compare": 3.000000106112566e-06, + "emit-offloaded-dropout": 1.2000000424450263e-05, + "flatten-call-graph": 1.2000000424450263e-05, + "fuse-send-recv": 1.8000000636675395e-05, + "hilo::LegalizeAlias": 1.9999999949504854e-06, + "hilo::NeuronInstCombine": 4.999999873689376e-06, + "hilo::NeuronOpFusion": 2.8000000384054147e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 7.000000096013537e-06, + "hilo::ScheduleFusion": 0.0, + "hilo::SixtyFourHack": 3.7000001611886546e-05, + "hilo::VerifyAliasing": 9.999999974752427e-07, + "hlo-mac-count": 0.00016900000628083944, + "hlo-verifier": 0.00015799999528098851, + "legalize-ccops": 9.999999974752427e-07, + "legalize-compare": 3.000000106112566e-06, + "lower-argminmax-custom-call": 3.999999989900971e-06, + "map-inline": 1.2999999853491317e-05, + "metadata-naming": 1.5999999959603883e-05, + "mlir::detail::OpToOpPassAdaptor": 2.700000004551839e-05, + "mlir::hlo::MhloToPyPenguin": 0.0008440000237897038, + "mlir::mhlo::LowerComplexExtraPass": 6.900000153109431e-05, + "mlir::mhlo::LowerComplexPass": 1.5999999959603883e-05, + "native-to-custom-softmax": 6.000000212225132e-06, + "native-to-custom-softmax-dx": 1.9999999494757503e-05, + "operand_upcaster": 1.2000000424450263e-05, + "post-par-pipe-begin": 9.999999974752427e-07, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.000526999996509403, + "replace-minimum-constant": 9.000000318337698e-06, + "reshape-mover": 3.000000106112566e-06, + "simplify-concat": 4.400000034365803e-05, + "simplify-while-loops": 1.9999999949504854e-06, + "transform-variadic-reduce": 4.8999998398358e-05, + "tuple-simplifier": 4.999999873689376e-06, + "unpack-nested-aws-ntwsr": 3.999999989900971e-06, + "unroll-while-loop": 9.999999974752427e-07 + }, + "hilo": { + "ArithmeticIntensity": 25.691875457763672, + "HloMacCount": 9974841344.0, + "Traffic": 776497728.0 + } + } +} \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk0/graph.neff b/context_encoding_model/_tp0_bk0/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..1512ad540261d8bf04a2891e464dc116f454980e --- /dev/null +++ b/context_encoding_model/_tp0_bk0/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0aeace703e08ac36bdcb2027d9a278403cb96ef39f48bddc999b077215e8a36 +size 1557504 diff --git a/context_encoding_model/_tp0_bk0/log-neuron-cc.txt b/context_encoding_model/_tp0_bk0/log-neuron-cc.txt new file mode 100644 index 0000000000000000000000000000000000000000..45c4988d4bbc8509a4d93cc7cd0843404012af61 --- /dev/null +++ b/context_encoding_model/_tp0_bk0/log-neuron-cc.txt @@ -0,0 +1,5055 @@ +2025-08-07T13:53:50Z INFO 47449 [root]: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/bin/neuronx-cc compile --framework=XLA /home/ubuntu/qwen3/context_encoding_model/_tp0_bk0/model.MODULE_f4171003694760566af4+a9cd68fb.hlo_module.pb --output /home/ubuntu/qwen3/context_encoding_model/_tp0_bk0/model.MODULE_f4171003694760566af4+a9cd68fb.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma' --lnc=1 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=/home/ubuntu/qwen3/context_encoding_model/_tp0_bk0/log-neuron-cc.txt --verbose=35 +2025-08-07T13:53:50Z INFO 47449 [root]: NeuronX Compiler version 2.20.9961.0+0acef03a Python version 3.10.12 HWM version 2.20.0.9961+0acef03a NumPy version 1.26.4 Running on AMI ami-040348201d80b58ad Running in region usw2-az4 +2025-08-07T13:53:50Z INFO 47514 [root]: XLA detected +2025-08-07T13:53:50Z INFO 47514 [root]: Pipeline: HLOToTensorizer Frontend StaticIOTranspose WalrusDriver BIRLinker Kelper NeffWrapper +2025-08-07T13:53:50Z INFO 47514 [root]: Intermediate files stored in /home/ubuntu/qwen3/context_encoding_model/_tp0_bk0/neuronxcc-vebk23i5, output in /home/ubuntu/qwen3/context_encoding_model/_tp0_bk0 +2025-08-07T13:53:50Z INFO 47514 [pipeline.Pipeline.0]: Job Pipeline len(in_states) 1 +2025-08-07T13:53:50Z INFO 47514 [pipeline.Pipeline.0]: Processing input #0 +2025-08-07T13:53:50Z INFO 47514 [pipeline.Pipeline.0]: Running pipeline Pipeline.0 +2025-08-07T13:53:50Z INFO 47514 [pipeline.Pipeline.0]: Starting job job.HLOToTensorizer.0 +2025-08-07T13:53:50Z INFO 47514 [job.HLOToTensorizer.0]: Job HLOToTensorizer len(in_states) 1 +2025-08-07T13:53:50Z INFO 47514 [job.HLOToTensorizer.0]: Processing input #0 +2025-08-07T13:53:50Z INFO 47514 [job.HLOToTensorizer.0]: IR signature: d89b9e073981a0b1b7d0bbd0a24f147e9df13c5706d9d6be9971b857124c9496 for model.MODULE_f4171003694760566af4+a9cd68fb.hlo_module.pb +2025-08-07T13:53:50Z INFO 47514 [job.HLOToTensorizer.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo2penguin --input /home/ubuntu/qwen3/context_encoding_model/_tp0_bk0/model.MODULE_f4171003694760566af4+a9cd68fb.hlo_module.pb --out-dir ./ --output penguin.py --remat --max-costly-ops=2 --max-live-in-size=5 --max-remat-chain-size=10 --max-mem-multiple=1.8 --min-def-use-distance=500 --remat-policy=transformer --allow-same-pass-remat=true --layers-per-module=1 --partition --emit-tensor-level-dropout-ops --modular-flow-mac-threshold=10 --verify-hlo=true --native-to-custom-softmax --partitioner-opts='--transformer' +2025-08-07T13:53:51Z INFO 47514 [job.HLOToTensorizer.0]: DEBUG: needsModular_PreSplit? Yes. macCnt 447256207360 threshold 4398046511104 num non-trivial Ops 3875 +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 38 + +Pre-Partition Pre-Opt Histogram: +total HLO instructions: 10629 + reshape 2091 19.67% ################################################################ + broadcast 1735 16.32% ##################################################### + convert 1281 12.05% ####################################### + transpose 1268 11.93% ###################################### + constant 819 7.71% ######################### + parameter 475 4.47% ############## + slice 445 4.19% ############# + add 365 3.43% ########### + multiply 328 3.09% ########## + dot 326 3.07% ######### + get-tuple-element 295 2.78% ######### + select 255 2.40% ####### + compare 222 2.09% ###### + call 186 1.75% ##### + concatenate 148 1.39% #### + tuple 73 0.69% ## + scatter 73 0.69% ## + negate 72 0.68% ## + all-reduce 72 0.68% ## + divide 39 0.37% # + custom-call 38 0.36% # + iota 7 0.07% + gather 6 0.06% + all-gather 3 0.03% + reduce 3 0.03% + sine 1 0.01% + cosine 1 0.01% + power 1 0.01% + maximum 1 0.01% + +INFO: IoStatistics: total inputs: 475 +INFO: IoStatistics: total outputs: 73 +INFO: IoStatistics: total passthrough tensors: 0 +INFO: IoStatistics: total outputs read from: 0 +INFO: IoStatistics: total redundant outputs: 0 +INFO: IoStatistics: total ifmap size (KiB): 8072795 +INFO: IoStatistics: total ofmap size (KiB): 73728 +INFO: IoStatistics: total must-alias size (KiB): 73728 +INFO: IoStatistics: total may-alias size (KiB): 0 +INFO: HloMacCount has found 447256199168 +INFO: Traffic has found 8343919789 +INFO: AIF 107.21 + +Pre-Partition Post-Op Histogram: +total HLO instructions: 6623 + reshape 1424 21.50% ################################################################ + convert 992 14.98% ############################################ + transpose 941 14.21% ########################################## + constant 523 7.90% ####################### + parameter 475 7.17% ##################### + broadcast 410 6.19% ################## + dot 325 4.91% ############## + custom-call 223 3.37% ########## + multiply 219 3.31% ######### + add 219 3.31% ######### + get-tuple-element 151 2.28% ###### + slice 147 2.22% ###### + concatenate 146 2.20% ###### + select 110 1.66% #### + compare 76 1.15% ### + scatter 73 1.10% ### + negate 72 1.09% ### + all-reduce 72 1.09% ### + gather 6 0.09% + iota 5 0.08% + all-gather 3 0.05% + reduce 3 0.05% + pad 2 0.03% + sine 1 0.02% + divide 1 0.02% + tuple 1 0.02% + maximum 1 0.02% + rng 1 0.02% + cosine 1 0.02% + +INFO: Found memory bound graph +DEBUG: needsModular_PreSplit? Yes. macCnt 447256199168 threshold 4398046511104 num non-trivial Ops 2702 +DEBUG: transformer model +INFO: Partitioner configs:ModularFlow BO LBL SA ConcatGraphs: 1 MaxDisj:2 MaxSep:4 LPM:1 +INFO: Markers NOT detected +Potential split-points stats: #CC 75 #AR 72 #AG 3 #BN 0 nClamp 0 +DEBUG: needsModular_SplitFinder? Yes. +ModuleSplitter initial partitioning... #parts 75 +ModuleSplitter initial partitioning... Done. +INFO: Num of unique Module Definitions: 6 +DEBUG: DefMap: 0 1 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 73 74 +New disjoint wave: start 2 len 70 NumReps: 35 macs 434529894400 +INFO: Attempting to identify and split optimizer at end +First non-zero-mac/used part from the end is 73 +Not enough zero-mac parts. skip +INFO: Optimized 0 all-reduce split instructions +INFO: Number of splitPoints: 37 +ModuleSplitter initial partitioning... #parts 37 +ModuleSplitter initial partitioning... Done. +Remat: gather-iota 0 matches, 0 ops rematted +INFO: Alias legality verification of partitions PASSED. +INFO: No transposable_weight_idx attrs found +INFO: Peak intermediate memory demand is at Partition 1. Num live intermediates at peak is 9 and memory usage is 4276228 bytes. +INFO: Please refer to LiveRangeReport_PostHloPart.txt for detailed intermediate lifetime info. +DEBUG: DefMap: 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 36 +Wrote HLO netlist to hlo_netlist.json +Wrote graph partitions in debug_info_hlo_partitions.json +Processing partition 0 +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 0 +Replaced 0 dropout sequences with OffloadedDropout +INFO: HloMacCount has found 2751463424 +INFO: Traffic has found 671184678 +INFO: AIF 8.20 +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert cosine custom-call dot gather get-tuple-element iota multiply negate parameter reshape scatter select sine slice transpose tuple +Invoking RemoveOptimizationBarriers pass +Processing partition 1 +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 0 +Replaced 0 dropout sequences with OffloadedDropout +INFO: HloMacCount has found 12415139840 +INFO: Traffic has found 201429540 +INFO: AIF 123.27 +HLO Ops used in computation: add all-reduce broadcast compare concatenate constant convert custom-call dot get-tuple-element multiply negate parameter reshape scatter select slice transpose tuple +Invoking RemoveOptimizationBarriers pass +Processing partition 2 +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 0 +Replaced 0 dropout sequences with OffloadedDropout +INFO: HloMacCount has found 9974841344 +INFO: Traffic has found 776497739 +INFO: AIF 25.69 +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert custom-call divide dot gather get-tuple-element iota maximum multiply pad parameter reduce reshape rng scatter select slice transpose tuple +Invoking RemoveOptimizationBarriers pass + +2025-08-07T13:53:51Z INFO 47514 [job.HLOToTensorizer.0]: IR signature: e64db8b38636f1e1de70247cb8ef599fe398def409ef42d3756559fd5fc4b0dd for sg0000/HLOToTensorizer +2025-08-07T13:53:51Z INFO 47514 [job.HLOToTensorizer.0]: IR signature: a31a130bd31092d77c4f2c3afb2624ee27b06f887e825fbab973844c57b282ba for sg0001/HLOToTensorizer +2025-08-07T13:53:51Z INFO 47514 [job.HLOToTensorizer.0]: IR signature: 5686d00929ecdf5c48dcae30b42e946122da025387070f32d1ea0a1c34518e99 for sg0002/HLOToTensorizer +2025-08-07T13:53:51Z INFO 47514 [job.HLOToTensorizer.0]: Job #0 finished +2025-08-07T13:53:51Z INFO 47514 [pipeline.Pipeline.0]: Finished job job.HLOToTensorizer.0 +2025-08-07T13:53:51Z INFO 47514 [pipeline.Pipeline.0]: Starting job job.Frontend.0 +2025-08-07T13:53:51Z INFO 47514 [job.Frontend.0]: Job Frontend len(in_states) 1 +2025-08-07T13:53:51Z INFO 47514 [job.Frontend.0]: Processing input #0 +2025-08-07T13:53:51Z INFO 47514 [job.Frontend.0]: Start model loading +2025-08-07T13:53:51Z INFO 47514 [job.Frontend.0]: Start tensorization +2025-08-07T13:53:51Z INFO 47514 [job.Frontend.0]: Num jobs: 128 +2025-08-07T13:53:51Z USER 47514 [root/Tensorizer/Tensorizer]: Running Tensorizer +2025-08-07T13:53:51Z INFO 47514 [Tensorizer]: Max workers: 3 +2025-08-07T13:53:51Z INFO 48703 [Tensorizer]: Building model from Penguin script "penguin.py.000000"... +2025-08-07T13:53:51Z INFO 48704 [Tensorizer]: Building model from Penguin script "penguin.py.000001"... +2025-08-07T13:53:51Z INFO 48705 [Tensorizer]: Building model from Penguin script "penguin.py.000002"... +2025-08-07T13:53:51Z INFO 48704 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=1 --num-neuroncores-per-sengine=1 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-08-07T13:53:51Z INFO 48703 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=1 --num-neuroncores-per-sengine=1 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-08-07T13:53:51Z INFO 48704 [sg0001/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:53:51Z INFO 48704 [sg0001/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:53:51Z INFO 48705 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=1 --num-neuroncores-per-sengine=1 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:53:51Z INFO 48704 [sg0001/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:53:51Z INFO 48704 [sg0001/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-08-07T13:53:51Z INFO 48704 [sg0001/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-08-07T13:53:51Z INFO 48704 [sg0001/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.001 seconds +2025-08-07T13:53:51Z INFO 48704 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-08-07T13:53:51Z INFO 48704 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-08-07T13:53:51Z INFO 48704 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.001 seconds +2025-08-07T13:53:51Z INFO 48704 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:51Z INFO 48704 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:51Z INFO 48704 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.002 seconds +2025-08-07T13:53:51Z INFO 48704 [sg0001/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-08-07T13:53:51Z INFO 48704 [sg0001/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.002 seconds +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.001 seconds +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.002 seconds +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.003 seconds +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.001 seconds +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.001 seconds +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.002 seconds +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.014 seconds +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.003 seconds +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.007 seconds +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.029 seconds +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.012 seconds +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.008 seconds +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.003 seconds +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/LegalizeCCOpLayout]: Finished (changed=True) +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.007 seconds +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.037 seconds +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.004 seconds +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-08-07T13:53:51Z INFO 48703 [sg0000/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.006 seconds +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.002 seconds +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/LegalizeCCOpLayout]: Finished (changed=True) +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.002 seconds +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.001 seconds +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.001 seconds +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.003 seconds +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.001 seconds +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.013 seconds +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.001 seconds +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-08-07T13:53:51Z INFO 48705 [sg0002/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.003 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.003 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.020 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/LateLowerTensorOp]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.005 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.026 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.005 seconds +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.027 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.018 seconds +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/LoopFusion]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.003 seconds +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.016 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Rematerialization]: Running Rematerialization +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Rematerialization]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.005 seconds +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Rematerialization]: Rematerialization finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.030 seconds +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.011 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Delinearization]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.005 seconds +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.010 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.009 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/DeadStoreElimination]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.037 seconds +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.019 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.003 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Delinearization]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/LoopFusion]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.008 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.001 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.006 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.109 seconds +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.004 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/PadElimination]: Running PadElimination +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/PadElimination]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/LoopFusion]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/PadElimination]: PadElimination finished after 0.000 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.030 seconds +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/Rematerialization]: Running Rematerialization +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/Rematerialization]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/Rematerialization]: Rematerialization finished after 0.004 seconds +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/LoopFusion]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.014 seconds +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/Delinearization]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.007 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.003 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.008 seconds +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.004 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.004 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48703 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.006 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Recompute]: Running Recompute +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Recompute]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.001 seconds +2025-08-07T13:53:52Z INFO 48705 [Tensorizer]: After optimization: 38 statements +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/MutateDataType]: Running MutateDataType +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/MutateDataType]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/MutateDataType]: MutateDataType finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.003 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/TileCCOps]: Running TileCCOps +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `All gather output tensor check failed` +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/TileCCOps]: in float32 (512,) %'all_gather.2' = AllGatherOp-149 AllGather_add(float32 (256,) %'add.11', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.8855 | hlo_id: 101 | , id = 149 +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=2048 is not above min_allgather_tile_size_in_bytes=8388608` +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/TileCCOps]: in uint32 (512,) %'all_gather.3' = AllGatherOp-165 AllGather_add(uint32 (256,) %'add.12', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.8990 | hlo_id: 110 | , id = 165 +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/TileCCOps]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48704 [sg0001/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.003 seconds +2025-08-07T13:53:52Z INFO 48704 [sg0001/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/TileCCOps]: TileCCOps finished after 0.007 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:52Z INFO 48704 [sg0001/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 48704 [sg0001/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.012 seconds +2025-08-07T13:53:52Z INFO 48704 [sg0001/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:53:52Z INFO 48704 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:52Z INFO 48704 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48704 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:52Z INFO 48704 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:52Z INFO 48704 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 48704 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.007 seconds +2025-08-07T13:53:52Z INFO 48704 [sg0001/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.031 seconds +2025-08-07T13:53:52Z INFO 48704 [sg0001/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-08-07T13:53:52Z INFO 48704 [sg0001/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.009 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.005 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-08-07T13:53:52Z INFO 48705 [sg0002/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 48704 [sg0001/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.007 seconds +2025-08-07T13:53:52Z INFO 48704 [sg0001/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-08-07T13:53:52Z INFO 48704 [sg0001/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 48704 [sg0001/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.003 seconds +2025-08-07T13:53:52Z INFO 48704 [sg0001/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-08-07T13:53:52Z INFO 48704 [sg0001/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.001 seconds +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.004 seconds +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.015 seconds +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.004 seconds +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.038 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.007 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.034 seconds +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.005 seconds +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.008 seconds +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.044 seconds +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-08-07T13:53:53Z INFO 48705 [sg0002/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.003 seconds +2025-08-07T13:53:53Z INFO 48705 [sg0002/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-08-07T13:53:53Z INFO 48705 [sg0002/Tensorizer/InferIntrinsicOnCC]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 48705 [sg0002/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.009 seconds +2025-08-07T13:53:53Z INFO 48705 [sg0002/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-08-07T13:53:53Z INFO 48705 [sg0002/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48705 [sg0002/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.004 seconds +2025-08-07T13:53:53Z INFO 48705 [sg0002/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:53Z INFO 48705 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 48705 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 48705 [sg0002/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-08-07T13:53:53Z INFO 48705 [sg0002/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 48705 [sg0002/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.012 seconds +2025-08-07T13:53:53Z INFO 48705 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:53Z INFO 48705 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48705 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.005 seconds +2025-08-07T13:53:53Z INFO 48705 [sg0002/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-08-07T13:53:53Z INFO 48705 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 48705 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.104 seconds +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:53Z INFO 48705 [sg0002/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.004 seconds +2025-08-07T13:53:53Z INFO 48705 [sg0002/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-08-07T13:53:53Z INFO 48705 [sg0002/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-08-07T13:53:53Z INFO 48705 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:53Z INFO 48705 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/LoopFusion]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 48705 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-08-07T13:53:53Z INFO 48705 [sg0002/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.007 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.001 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.005 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.004 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/PadElimination]: Running PadElimination +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/PadElimination]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/PadElimination]: PadElimination finished after 0.000 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.006 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.029 seconds +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/Rematerialization]: Running Rematerialization +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/Rematerialization]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/Rematerialization]: Rematerialization finished after 0.003 seconds +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.009 seconds +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/Delinearization]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.012 seconds +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:53:53Z INFO 48705 [sg0002/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.027 seconds +2025-08-07T13:53:53Z INFO 48705 [sg0002/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.005 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.042 seconds +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.003 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/TCTransform]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.005 seconds +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.001 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.004 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.006 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.005 seconds +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/LoopFusion]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.031 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/Recompute]: Running Recompute +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/Recompute]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 48703 [Tensorizer]: After optimization: 26 statements +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/MutateDataType]: Running MutateDataType +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/MutateDataType]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/MutateDataType]: MutateDataType finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.004 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/TileCCOps]: Running TileCCOps +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=1048576 is not above min_allgather_tile_size_in_bytes=8388608` +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/TileCCOps]: in bfloat16 (4096, 128) %'all_gather.1' = AllGatherOp-46 AllGather_add(bfloat16 (2048, 128) %'transpose.1', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((4096, 128), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.47 | hlo_id: 19 | , id = 46 +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/TileCCOps]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.007 seconds +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/TileCCOps]: TileCCOps finished after 0.006 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:53Z INFO 48704 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.011 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.008 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.001 seconds +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-08-07T13:53:53Z INFO 48703 [sg0000/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48705 [sg0002/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.006 seconds +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.560 seconds +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.016 seconds +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.005 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/InferIntrinsicOnCC]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/PadElimination]: Running PadElimination +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/PadElimination]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/PadElimination]: PadElimination finished after 0.000 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.010 seconds +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/LoopFusion]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.007 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.015 seconds +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.008 seconds +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.005 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.120 seconds +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.035 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/Recompute]: Running Recompute +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/Recompute]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 48704 [Tensorizer]: After optimization: 25 statements +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/MutateDataType]: Running MutateDataType +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/MutateDataType]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/MutateDataType]: MutateDataType finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/TileCCOps]: Running TileCCOps +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/TileCCOps]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/TileCCOps]: TileCCOps finished after 0.006 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.007 seconds +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.249 seconds +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.013 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.054 seconds +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.011 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.001 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/InferIntrinsicOnCC]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.062 seconds +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.012 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.007 seconds +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.568 seconds +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.020 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.005 seconds +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.010 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.005 seconds +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/PGTiling]: Running PGTiling +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 585 of IO tensor {'CrossPassTensor': ''}bfloat16 %input471|NC|(128, 32) is not sorted, index list (w/ AG ids): [(16, 'AG49'), (13, 'AG50')] +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 586 of IO tensor {'CrossPassTensor': ''}bfloat16 %input472|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(16, 'AG49'), (13, 'AG50')] +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 587 of IO tensor {'CrossPassTensor': ''}bfloat16 %input470|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(16, 'AG49'), (13, 'AG50')] +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 588 of IO tensor {'CrossPassTensor': ''}bfloat16 %input469(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(10, 'AG54'), (15, 'AG52'), (11, 'AG53')] +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 589 of IO tensor {'CrossPassTensor': ''}bfloat16 %input474|NC|(128, 32) is not sorted, index list (w/ AG ids): [(16, 'AG49'), (13, 'AG50')] +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 540 of IO tensor {'CrossPassTensor': ''}bfloat16 %input473|NC|(75968, 32, 128) is not sorted, index list (w/ AG ids): [(14, 'AG59'), (13, 'AG50')] +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.005 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.019 seconds +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.005 seconds +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.000 seconds +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.035 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.008 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.119 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.027 seconds +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/PGTiling]: PGTiling finished after 0.162 seconds +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.031 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.015 seconds +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-08-07T13:53:54Z INFO 48705 [sg0002/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.258 seconds +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.008 seconds +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.391 seconds +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 48703 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.086 seconds +2025-08-07T13:53:54Z INFO 48704 [sg0001/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.005 seconds +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.008 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.016 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 1.630 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/TilingBottleneck]: 19008: transpose_128x128 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/TilingBottleneck]: 19008: matmul_128x128x1 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x128 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x128 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/TilingBottleneck]: 384: matmul_128x128x512 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/TilingBottleneck]: 48: simd128x128 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/TilingBottleneck]: 32: transpose_128x128 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/TilingBottleneck]: 32: rmsnorm128x128x128 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/TilingBottleneck]: 32: transpose_128x128 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/TilingBottleneck]: 32: simd128x128 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/TilingBottleneck]: 32: rmsnorm128x128x128 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/TilingBottleneck]: 32: transpose_128x128 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/TilingBottleneck]: 2: indirect_load128x1 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/TilingBottleneck]: 1: simd1x1 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/TilingBottleneck]: 1: simd1x1 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/TilingBottleneck]: 1: simd1x1 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/TilingBottleneck]: 1: reduce128x1x1 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/TilingBottleneck]: 1: simd1x128 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/TilingBottleneck]: 1: reduce128x1x1 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/TilingBottleneck]: 1: indirect_load32x128 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.003 seconds +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.005 seconds +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/PGTiling]: Running PGTiling +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.117 seconds +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.003 seconds +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.003 seconds +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 702 of IO tensor {'CrossPassTensor': ''}bfloat16 %input79|NC|(128, 32) is not sorted, index list (w/ AG ids): [(16, 'AG82'), (15, 'AG83')] +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 703 of IO tensor {'CrossPassTensor': ''}bfloat16 %input83(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(16, 'AG82'), (15, 'AG83')] +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 704 of IO tensor {'CrossPassTensor': ''}bfloat16 %input81(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(16, 'AG82'), (15, 'AG83')] +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 705 of IO tensor {'CrossPassTensor': ''}bfloat16 %input78|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(16, 'AG82'), (15, 'AG83')] +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 459 of IO tensor {'CrossPassTensor': ''}bfloat16 %input77(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(9, 'AG94'), (6, 'AG90'), (7, 'AG89'), (11, 'AG93'), (13, 'AG92')] +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 698 of IO tensor non_local bfloat16 %all_gather.1(32, 128, 128) is not sorted, index list (w/ AG ids): [(16, 'AG82'), (8, 'AG84')] +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.041 seconds +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.007 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.006 seconds +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.006 seconds +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.002 seconds +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.000 seconds +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.005 seconds +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/PGTiling]: Running PGTiling +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.002 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 669 of IO tensor {'CrossPassTensor': ''}bfloat16 %input86|NC|(128, 32) is not sorted, index list (w/ AG ids): [(15, 'AG85'), (12, 'AG86')] +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 670 of IO tensor {'CrossPassTensor': ''}bfloat16 %input87|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(15, 'AG85'), (12, 'AG86')] +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 671 of IO tensor {'CrossPassTensor': ''}bfloat16 %input85|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(15, 'AG85'), (12, 'AG86')] +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 672 of IO tensor {'CrossPassTensor': ''}bfloat16 %input84(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(7, 'AG90'), (14, 'AG88'), (8, 'AG89')] +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 673 of IO tensor {'CrossPassTensor': ''}bfloat16 %input90|NC|(128, 32) is not sorted, index list (w/ AG ids): [(15, 'AG85'), (12, 'AG86')] +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 674 of IO tensor {'CrossPassTensor': ''}bfloat16 %input94(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(15, 'AG85'), (12, 'AG86')] +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 679 of IO tensor {'CrossPassTensor': ''}bfloat16 %input92(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(15, 'AG85'), (12, 'AG86')] +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 680 of IO tensor {'CrossPassTensor': ''}bfloat16 %input89|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(15, 'AG85'), (12, 'AG86')] +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 447 of IO tensor {'CrossPassTensor': ''}bfloat16 %input88(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(2, 'AG100'), (0, 'AG96'), (1, 'AG95'), (3, 'AG99'), (4, 'AG98')] +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.035 seconds +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.020 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.002 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.002 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.102 seconds +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/PGTiling]: PGTiling finished after 0.260 seconds +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.006 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.010 seconds +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.025 seconds +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 1.214 seconds +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/TilingBottleneck]: 512: matmul_128x128x128 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/TilingBottleneck]: 128: matmul_128x128x128 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/TilingBottleneck]: 128: matmul_128x128x512 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/TilingBottleneck]: 32: simd32x512 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/TilingBottleneck]: 32: rmsnorm128x128x128 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/TilingBottleneck]: 32: matmul_128x128x512 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/TilingBottleneck]: 16: transpose_128x128 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/TilingBottleneck]: 16: transpose_128x128 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/TilingBottleneck]: 16: transpose_128x128 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/TilingBottleneck]: 16: matmul_128x128x128 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/TilingBottleneck]: 16: transpose_128x128 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/TilingBottleneck]: 4: indirect_load128x512 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/TilingBottleneck]: 4: rmsnorm128x512x128 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/TilingBottleneck]: 4: simd128x256 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/TilingBottleneck]: 4: simd128x256 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/TilingBottleneck]: 4: simd128x512 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/TilingBottleneck]: 4: transpose_128x128 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/TilingBottleneck]: 4: simd128x64 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/TilingBottleneck]: 4: simd128x64 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/TilingBottleneck]: 4: simd128x128 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.010 seconds +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.009 seconds +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.006 seconds +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.066 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 19008: transpose_128x128 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 19008: matmul_128x128x1 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x128 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x128 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 594: transpose_128x1 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 384: dma128x512 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x512 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 48: dma128x4096 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 48: dma128x4096 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 48: simd128x128 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 32: transpose_128x128 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 32: rmsnorm128x128x128 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 32: transpose_128x128 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 32: simd128x128 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 32: rmsnorm128x128x128 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 32: transpose_128x128 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 4: dma128x1024 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 2: indirect_load128x1 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 1: dma128x32 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 1: simd1x1 +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.034 seconds +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.005 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.006 seconds +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.007 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.001 seconds +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.011 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.008 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.008 seconds +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.006 seconds +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.002 seconds +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.000 seconds +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.002 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.003 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.001 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.002 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.101 seconds +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 512: matmul_128x128x128 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: dma32x128 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: matmul_128x128x128 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: dma128x512 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: matmul_128x128x512 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: dma32x512 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: simd32x512 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: dma128x128 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: rmsnorm128x128x128 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: dma128x512 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: matmul_128x128x512 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 16: transpose_128x128 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 16: dma128x4096 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 16: transpose_128x128 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 16: transpose_128x128 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 16: matmul_128x128x128 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 16: transpose_128x128 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 4: indirect_load128x512 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 4: rmsnorm128x512x128 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 4: simd128x256 +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.010 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/InferInitValue]: Running InferInitValue +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.092 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/InferInitValue]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/PGTiling]: PGTiling finished after 0.579 seconds +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/InferInitValue]: InferInitValue finished after 0.024 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.018 seconds +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.003 seconds +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.007 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/SimplifyTensor]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48704 [sg0001/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.005 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/SundaISel]: Running SundaISel +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.009 seconds +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.009 seconds +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/SundaISel]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48703 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/SundaISel]: SundaISel finished after 0.042 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.000 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.024 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.003 seconds +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:53:55Z INFO 48705 [sg0002/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.009 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.008 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.004 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.006 seconds +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.001 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.003 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.010 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/InferInitValue]: Running InferInitValue +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.009 seconds +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/InferInitValue]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.006 seconds +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/InferInitValue]: InferInitValue finished after 0.026 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.029 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.009 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/SimplifyTensor]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.006 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.008 seconds +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/SundaISel]: Running SundaISel +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 1.519 seconds +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x128 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x128 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/TilingBottleneck]: 512: matmul_128x128x128 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/TilingBottleneck]: 384: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/TilingBottleneck]: 128: matmul_128x128x128 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/TilingBottleneck]: 128: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/TilingBottleneck]: 48: simd128x128 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/TilingBottleneck]: 32: transpose_128x128 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/TilingBottleneck]: 32: rmsnorm128x128x128 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/TilingBottleneck]: 32: transpose_128x128 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/TilingBottleneck]: 32: simd128x128 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/TilingBottleneck]: 32: transpose_128x128 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/TilingBottleneck]: 32: rmsnorm128x128x128 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/TilingBottleneck]: 32: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/TilingBottleneck]: 16: matmul_128x128x128 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/TilingBottleneck]: 16: softmax128x1x128 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/TilingBottleneck]: 16: transpose_128x128 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/TilingBottleneck]: 4: rmsnorm128x512x128 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/TilingBottleneck]: 4: simd64x512 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/TilingBottleneck]: 4: simd64x512 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.025 seconds +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/NeuronValueNumbering]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/SundaISel]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.012 seconds +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/SundaISel]: SundaISel finished after 0.041 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.010 seconds +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.000 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.037 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.040 seconds +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.014 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.007 seconds +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.007 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.003 seconds +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.008 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.004 seconds +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/VectorizeDMA]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.001 seconds +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.004 seconds +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.041 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronValueNumbering]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.005 seconds +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.004 seconds +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.004 seconds +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.001 seconds +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/DeConcat]: Running DeConcat +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/DeConcat]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/DeConcat]: DeConcat finished after 0.001 seconds +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/PartialSimdFusion]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.004 seconds +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.004 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.031 seconds +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.012 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/VectorizeDMA]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.121 seconds +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x128 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x128 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 512: matmul_128x128x128 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 384: dma128x512 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: matmul_128x128x128 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: dma128x512 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 48: dma128x4096 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 48: dma128x4096 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 48: simd128x128 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: transpose_128x128 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: rmsnorm128x128x128 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: transpose_128x128 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: simd128x128 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: transpose_128x128 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: rmsnorm128x128x128 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: dma128x512 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 16: dma128x4096 +2025-08-07T13:53:56Z INFO 48704 [sg0001/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.004 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/DeConcat]: Running DeConcat +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/DeConcat]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/DeConcat]: DeConcat finished after 0.001 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.006 seconds +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.011 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/PartialLoopFusion]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.021 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.006 seconds +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.009 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.011 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.005 seconds +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.020 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.012 seconds +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.008 seconds +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:53:56Z INFO 48703 [sg0000/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.008 seconds +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.001 seconds +2025-08-07T13:53:56Z INFO 48705 [sg0002/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:53:57Z INFO 48705 [sg0002/Tensorizer/SpillPSum]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.004 seconds +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48705 [sg0002/Tensorizer/SpillPSum]: SpillPSum finished after 0.010 seconds +2025-08-07T13:53:57Z INFO 48705 [sg0002/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.010 seconds +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.011 seconds +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.003 seconds +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.010 seconds +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.002 seconds +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.028 seconds +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.010 seconds +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.002 seconds +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/SpillPSum]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.003 seconds +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.001 seconds +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.003 seconds +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.006 seconds +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/InferInitValue]: Running InferInitValue +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/SpillPSum]: SpillPSum finished after 0.012 seconds +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/InferInitValue]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/InferInitValue]: InferInitValue finished after 0.031 seconds +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.010 seconds +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/SimplifyTensor]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.031 seconds +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.001 seconds +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/LegalizeType]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/LegalizeType]: LegalizeType finished after 0.004 seconds +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48705 [sg0002/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.008 seconds +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:53:57Z INFO 48705 [sg0002/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.304 seconds +2025-08-07T13:53:57Z INFO 48705 [sg0002/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-08-07T13:53:57Z INFO 48705 [sg0002/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.006 seconds +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.031 seconds +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/WeightCoalescing]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48705 [sg0002/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.009 seconds +2025-08-07T13:53:57Z INFO 48705 [sg0002/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:53:57Z INFO 48705 [sg0002/Tensorizer/LegalizeType]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/SundaISel]: Running SundaISel +2025-08-07T13:53:57Z INFO 48705 [sg0002/Tensorizer/LegalizeType]: LegalizeType finished after 0.013 seconds +2025-08-07T13:53:57Z INFO 48705 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.003 seconds +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/SundaISel]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/SundaISel]: SundaISel finished after 0.042 seconds +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:57Z INFO 48705 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48705 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.036 seconds +2025-08-07T13:53:57Z INFO 48705 [sg0002/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.000 seconds +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.035 seconds +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.004 seconds +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.007 seconds +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.003 seconds +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.007 seconds +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.001 seconds +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/DataStreaming]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/DataStreaming]: DataStreaming finished after 0.003 seconds +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.039 seconds +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.002 seconds +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.002 seconds +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.015 seconds +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.001 seconds +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.096 seconds +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.005 seconds +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.005 seconds +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.005 seconds +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48705 [sg0002/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.002 seconds +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 273.865us (31.758MiB, est bw: 121.595GB/s, 51.078% of tot. time) for bfloat16<32 x 16260> TongaSB partitions[2] bfloat16 (4, 8, 32, 16260) %'all_gather.1_nostride_1562'(init=0.0)[c0_980,4c1_981_0+c1_981_1,i0.32,i1.16260] = load bfloat16<32 x 16260> non_local bfloat16 (32, 16384) %'all_gather.1'[i0.32,32c0_980+16c1_981_0+i1.16260+4c1_981_1] # id=1137, src_id=None, , attrs={'can_read_uninit': True}, instances=32 # dl = tensor_op_name: UnnamedModule | hlo_id: 1 | [[i0.32];[i1.16260]] -> [[i0.32];[i1.16260]] +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 74.128us (16.000MiB, est bw: 226.329GB/s, 13.825% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (4, 4, 128, 4096) %'input83_local_1033'[i48_0_1297,i32_0_0_1,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (4, 4, 128, 4096) %'input83'[i48_0_1297,i32_0_0_1,i0.128,i1.4096] # id=1179, src_id=None, , instances=16 # dl = tensor_op_name: _dot.2 | hlo_id: 34 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 74.128us (16.000MiB, est bw: 226.329GB/s, 13.825% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (4, 2, 128, 16, 512) %'input77_local_1070'[i122_0_0_0_1076_0,i122_0_0_0_1,i0.128,i3.16,i1.128+256p_1673+128i2.2] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (8, 2, 128, 16, 2, 128) %'input77'[2i122_0_0_0_1076_0+i122_0_0_0_1,p_1673,i0.128,i3.16,i2.2,i1.128] # id=1283, src_id=None, , instances=16 # dl = tensor_op_name: _dot.3 | hlo_id: 145 | [[i0.128];[i1.128, i2.2, i3.16]] -> [[i0.128];[i1.128, i2.2, i3.16]] +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 22.961us (1.000MiB, est bw: 45.668GB/s, 4.282% of tot. time) for bfloat16<128 x 128> TongaSB partitions[1] bfloat16 (32, 128, 128) %'custom-call.226.1457'[i29_0_1019,i0.128,i1.128] = load bfloat16<128 x 128> non_local bfloat16 (32, 16384) %'all_gather.1'[i29_0_1019,128i0.128+i1.128] # id=1174, src_id=None, , instances=32 # dl = tensor_op_name: _custom-call.226 | hlo_id: 27 | [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 19.507us (4.000MiB, est bw: 215.017GB/s, 3.638% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (4, 128, 4096) %'input81_local_1046'[i59_0_0_1845,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (4, 128, 4096) %'input81'[i59_0_0_1845,i0.128,i1.4096] # id=1224, src_id=None, , instances=4 # dl = tensor_op_name: _dot.1 | hlo_id: 82 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 19.507us (4.000MiB, est bw: 215.017GB/s, 3.638% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[0] bfloat16 (128, 32, 512) %'input78_local_1059'[i0.128,i2.32,128p_1604+i1.128] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (4, 128, 32, 128) %'input78'[p_1604,i0.128,i2.32,i1.128] # id=1278, src_id=None, , instances=4 # dl = tensor_op_name: _dot | hlo_id: 131 | [[i0.128];[i1.128, i2.32]] -> [[i0.128];[i1.128, i2.32]] +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 7.534us (512.000KiB, est bw: 69.590GB/s, 1.405% of tot. time) for bfloat16<128 x 128> non_local bfloat16 (4, 4, 128, 128) %'transpose.1'[T_i12_0_954,T_i12_1_954_0_1848_1849,i1.128,i0.128] = store bfloat16<128 x 128> TongaSB partitions[1] bfloat16 (4, 128, 512) %'950.1676'[T_i12_0_954,i1.128,i0.128+128T_i12_1_954_0_1848_1849] # id=1398, src_id=None, , instances=16 # dl = tensor_op_name: transpose.1_pftranspose_950 | hlo_id: 16 | [[i1.128];[i0.128]] -> [[i1.128];[i0.128]] +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 7.337us (1.000MiB, est bw: 142.912GB/s, 1.368% of tot. time) for bfloat16<32 x 4096> {'IntermediateTensor': ''}bfloat16 (128, 32, 128) %'intermediate1'(init=0.0)[32i0_0_0_992+i2.32,i0.32,i1.128] = store bfloat16<32 x 4096> TongaSB partitions[1] bfloat16 (4, 32, 32, 128) %'UnnamedModule.1678'[i0_0_0_992,i0.32,i2.32,i1.128] # id=1139, src_id=None, , instances=4 # dl = tensor_op_name: UnnamedModule | hlo_id: 1 | [[i0.32];[i1.128, i2.32]] -> [[i0.32];[i1.128, i2.32]] +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 4.953us (1.000MiB, est bw: 211.705GB/s, 0.924% of tot. time) for bfloat16<128 x 1024> non_local bfloat16 (524288,) %'dot.4-buffer-1868'[1024i122_0_0_0_1076_0+4096i0.128+i1.1024] = store bfloat16<128 x 1024> TongaSB partitions[1] bfloat16 (4, 128, 1024) %1077[i122_0_0_0_1076_0,i0.128,i1.1024] # id=1286, src_id=None, , instances=4 # dl = tensor_op_name: _dot.3 | hlo_id: 145 | [[i0.128];[i1.1024]] -> [[i0.128];[i1.1024]] +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 3.836us (512.000KiB, est bw: 136.670GB/s, 0.715% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[0] bfloat16 (128, 2048) %'transpose.1_pftranspose_950'[i0.128,i1.2048] = indirect_load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (151936, 2048) %'input76'[i0.128,i1.2048] generic generic_dims:[0] generic_addrs: int32<128 x 1> TongaSB partitions[0] int32 (128, 1) %'gather.41.1674'[i0.128,0] # id=1135, src_id=None, , attrs={'mode': OOBMode.ERROR}, instances=1 # dl = tensor_op_name: _gather.41 | hlo_id: 16 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48705 [sg0002/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.308 seconds +2025-08-07T13:53:57Z INFO 48705 [sg0002/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:53:57Z INFO 48705 [sg0002/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.005 seconds +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/OptimizeNKIKernels]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48705 [sg0002/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.008 seconds +2025-08-07T13:53:57Z INFO 48705 [sg0002/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.002 seconds +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.010 seconds +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.016 seconds +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48705 [sg0002/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.004 seconds +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48705 [sg0002/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.075 seconds +2025-08-07T13:53:57Z INFO 48705 [sg0002/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-08-07T13:53:57Z INFO 48705 [sg0002/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.036 seconds +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronValueNumbering]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48705 [sg0002/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.014 seconds +2025-08-07T13:53:57Z INFO 48705 [sg0002/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.003 seconds +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:53:57Z INFO 48705 [sg0002/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.031 seconds +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-08-07T13:53:57Z INFO 48703 [sg0000/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.013 seconds +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/VectorizeDMA]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.003 seconds +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:57Z INFO 48704 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.001 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/DeConcat]: Running DeConcat +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/DeConcat]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/DeConcat]: DeConcat finished after 0.001 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.001 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.011 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-08-07T13:53:58Z INFO 48705 [sg0002/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.013 seconds +2025-08-07T13:53:58Z INFO 48705 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:58Z INFO 48703 [sg0000/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.003 seconds +2025-08-07T13:53:58Z INFO 48703 [sg0000/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 48703 [sg0000/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.043 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 48703 [sg0000/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.005 seconds +2025-08-07T13:53:58Z INFO 48703 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-08-07T13:53:58Z INFO 48703 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 48703 [sg0000/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-08-07T13:53:58Z INFO 48703 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.011 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.003 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-08-07T13:53:58Z INFO 48705 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 48703 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 48705 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.119 seconds +2025-08-07T13:53:58Z INFO 48705 [sg0002/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:53:58Z INFO 48705 [sg0002/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 48703 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.046 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.011 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 48703 [Tensorizer]: BirCodeGen estimate #instances=1466 in sg0000 +2025-08-07T13:53:58Z INFO 48703 [Tensorizer]: IR signature: 3964da3cc9cc122ff21f31e2c8bf756c8441b11f6ecb814c2922ac3920edc847 for nc00/sg0000/TensorizerBIR +2025-08-07T13:53:58Z INFO 48703 [Tensorizer]: Weights total number of bytes: 139520 +2025-08-07T13:53:58Z INFO 48703 [Tensorizer]: Successfully built model. +2025-08-07T13:53:58Z INFO 48705 [sg0002/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.011 seconds +2025-08-07T13:53:58Z INFO 48705 [sg0002/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.004 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.010 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.018 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.001 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/SpillPSum]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/SpillPSum]: SpillPSum finished after 0.013 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.032 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/LegalizeType]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/LegalizeType]: LegalizeType finished after 0.004 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.006 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.023 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/WeightCoalescing]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.015 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.003 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.003 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.005 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/DataStreaming]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/DataStreaming]: DataStreaming finished after 0.003 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.111 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.004 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.005 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.001 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 25.347% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[3] bfloat16 (4, 2, 2, 128, 24, 512) %'input84_local_886'[i15_0_0_892_0,i15_0_0_1,c1_880,i0.128,i2.24,i1.128+128p_1295] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (8, 4, 2, 128, 24, 128) %'input84'[2i15_0_0_892_0+i15_0_0_1,p_1295,c1_880,i0.128,i2.24,i1.128] # id=1044, src_id=None, , instances=64 # dl = tensor_op_name: _dot.6 | hlo_id: 49 | [[i0.128];[i1.128, i2.24]] -> [[i0.128];[i1.128, i2.24]] +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 219.783us (48.000MiB, est bw: 229.006GB/s, 24.073% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 24, 128, 4096) %'input85_local_867'[i10_0_0,i10_0_1,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 24, 128, 4096) %'input85'[i10_0_0,i10_0_1,i0.128,i1.4096] # id=1035, src_id=None, , instances=48 # dl = tensor_op_name: _dot.4 | hlo_id: 39 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 219.783us (48.000MiB, est bw: 229.006GB/s, 24.073% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 24, 128, 4096) %'input87_local_876'[i12_0_0,i12_0_1,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 24, 128, 4096) %'input87'[i12_0_0,i12_0_1,i0.128,i1.4096] # id=1038, src_id=None, , instances=48 # dl = tensor_op_name: _dot.5 | hlo_id: 30 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 74.128us (16.000MiB, est bw: 226.329GB/s, 8.119% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (4, 4, 128, 4096) %'input94_local_906'[i41_0_1140,i25_0_0_1,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (4, 4, 128, 4096) %'input94'[i41_0_1140,i25_0_0_1,i0.128,i1.4096] # id=1058, src_id=None, , instances=16 # dl = tensor_op_name: _dot.9 | hlo_id: 67 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 74.128us (16.000MiB, est bw: 226.329GB/s, 8.119% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (4, 2, 128, 16, 512) %'input88_local_963'[i115_0_0_0_969_0,i115_0_0_0_1,i0.128,i3.16,i1.128+256p_1319+128i2.2] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (8, 2, 128, 16, 2, 128) %'input88'[2i115_0_0_0_969_0+i115_0_0_0_1,p_1319,i0.128,i3.16,i2.2,i1.128] # id=1113, src_id=None, , instances=16 # dl = tensor_op_name: _dot.10 | hlo_id: 165 | [[i0.128];[i1.128, i2.2, i3.16]] -> [[i0.128];[i1.128, i2.2, i3.16]] +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 19.507us (4.000MiB, est bw: 215.017GB/s, 2.137% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (4, 128, 4096) %'input92_local_919'[i52_0_0_1504,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (4, 128, 4096) %'input92'[i52_0_0_1504,i0.128,i1.4096] # id=1079, src_id=None, , instances=4 # dl = tensor_op_name: _dot.8 | hlo_id: 102 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 19.507us (4.000MiB, est bw: 215.017GB/s, 2.137% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[0] bfloat16 (128, 32, 512) %'input89_local_952'[i0.128,i2.32,128p_1306+i1.128] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (4, 128, 32, 128) %'input89'[p_1306,i0.128,i2.32,i1.128] # id=1108, src_id=None, , instances=4 # dl = tensor_op_name: _dot.7 | hlo_id: 151 | [[i0.128];[i1.128, i2.32]] -> [[i0.128];[i1.128, i2.32]] +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 0.641% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[0] bfloat16 (128, 4096) %'816.1259'[i0.128,i1.4096] = load bfloat16<128 x 4096> non_local bfloat16 (128, 4096) %'add.4'[i0.128,i1.4096] # id=1141, src_id=None, , instances=1 # dl = tensor_op_name: add.4_pftranspose_816 | hlo_id: 17 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 0.641% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[0] bfloat16 (128, 4096) %'820.1264'[i0.128,i1.4096] = load bfloat16<128 x 4096> non_local bfloat16 (524288,) %'all_reduce.1-buffer-1533'[4096i0.128+i1.4096] # id=1150, src_id=None, , instances=1 # dl = tensor_op_name: all_reduce.1_pftranspose_820 | hlo_id: 52 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 4.953us (1.000MiB, est bw: 211.705GB/s, 0.543% of tot. time) for bfloat16<128 x 1024> non_local bfloat16 (524288,) %'dot.7-buffer-1531'[1024i15_0_0_892_0+4096i0.128+i1.1024] = store bfloat16<128 x 1024> TongaSB partitions[1] bfloat16 (4, 128, 1024) %893[i15_0_0_892_0,i0.128,i1.1024] # id=1047, src_id=None, , instances=4 # dl = tensor_op_name: _dot.6 | hlo_id: 49 | [[i0.128];[i1.1024]] -> [[i0.128];[i1.1024]] +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.004 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/OptimizeNKIKernels]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.018 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.003 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.010 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.004 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 48704 [sg0001/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.031 seconds +2025-08-07T13:53:58Z INFO 48704 [Tensorizer]: BirCodeGen estimate #instances=5137 in sg0001 +2025-08-07T13:53:58Z INFO 48704 [Tensorizer]: IR signature: d5acbe7a0b31f8ac95815686da7c851afebbcfdd5cbaa985cad9f402d630f11e for nc00/sg0001/TensorizerBIR +2025-08-07T13:53:58Z INFO 48704 [Tensorizer]: Weights total number of bytes: 139264 +2025-08-07T13:53:58Z INFO 48704 [Tensorizer]: Successfully built model. +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/SimplifyNeuronTensor]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 1.038 seconds +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.005 seconds +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/DataStreaming]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/DataStreaming]: DataStreaming finished after 0.037 seconds +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.288 seconds +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.014 seconds +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.014 seconds +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.009 seconds +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 2.705ms (594.000MiB, est bw: 230.258GB/s, 73.507% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (594, 128, 4096) %'695.1078'[i31_0,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (75968, 4096) %'input473'[128i31_0+i0.128,i1.4096] # id=1077, src_id=None, , instances=594 # dl = tensor_op_name: input473_pftranspose_695 | hlo_id: 90 | if -128i31_0-i0.128+75967 >= 0 [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 6.288% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[3] bfloat16 (4, 2, 2, 128, 24, 512) %'input469_local_766'[i15_0_0_772_0,i15_0_0_1,c1_760,i0.128,i2.24,i1.128+128p_2159] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (8, 4, 2, 128, 24, 128) %'input469'[2i15_0_0_772_0+i15_0_0_1,p_2159,c1_760,i0.128,i2.24,i1.128] # id=930, src_id=None, , instances=64 # dl = tensor_op_name: _dot.256 | hlo_id: 59 | [[i0.128];[i1.128, i2.24]] -> [[i0.128];[i1.128, i2.24]] +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 219.783us (48.000MiB, est bw: 229.006GB/s, 5.972% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 24, 128, 4096) %'input470_local_747'[i10_0_0,i10_0_1,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 24, 128, 4096) %'input470'[i10_0_0,i10_0_1,i0.128,i1.4096] # id=921, src_id=None, , instances=48 # dl = tensor_op_name: _dot.254 | hlo_id: 49 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 219.783us (48.000MiB, est bw: 229.006GB/s, 5.972% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 24, 128, 4096) %'input472_local_756'[i12_0_0,i12_0_1,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 24, 128, 4096) %'input472'[i12_0_0,i12_0_1,i0.128,i1.4096] # id=924, src_id=None, , instances=48 # dl = tensor_op_name: _dot.255 | hlo_id: 40 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 191.807us (297.000KiB, est bw: 1.586GB/s, 5.212% of tot. time) for float32<1 x 128> {'no_delinear': '0'}non_local float32 (1, 75968) %'convert.59'[0,128i31_0+i0.128] = store float32<1 x 128> TongaSB partitions[1] float32 (594, 1, 128) %'dot.257.1088'[i31_0,0,i0.128] # id=1086, src_id=None, , instances=594 # dl = tensor_op_name: _dot.257 | hlo_id: 90 | if -128i31_0-i0.128+75967 >= 0 [[];[i0.128]] -> [[];[i0.128]] +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 22.647us (296.758KiB, est bw: 13.418GB/s, 0.615% of tot. time) for float32<1 x 15194> TongaSB partitions[1] float32 (5, 1, 15194) %'custom-call.411.1157'[i1,0,i0.15194] = load float32<1 x 15194> {'no_delinear': '0'}non_local float32 (1, 75968) %'convert.59'[15194i1+i0.15194] # id=1152, src_id=None, , instances=5 # dl = tensor_op_name: _custom-call.411 | hlo_id: 93 | if -15194i1-i0.15194+75967 >= 0 [[];[i0.15194]] -> [[];[i0.15194]] +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 0.159% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[0] bfloat16 (128, 4096) %'699.2138'[i0.128,i1.4096] = load bfloat16<128 x 4096> non_local bfloat16 (128, 4096) %'add.9'[i0.128,i1.4096] # id=1052, src_id=None, , instances=1 # dl = tensor_op_name: add.9_pftranspose_699 | hlo_id: 27 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 0.159% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[0] bfloat16 (128, 4096) %'703.2143'[i0.128,i1.4096] = load bfloat16<128 x 4096> non_local bfloat16 (524288,) %'all_reduce.3-buffer-2750'[4096i0.128+i1.4096] # id=1061, src_id=None, , instances=1 # dl = tensor_op_name: all_reduce.3_pftranspose_703 | hlo_id: 62 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 4.953us (1.000MiB, est bw: 211.705GB/s, 0.135% of tot. time) for bfloat16<128 x 1024> non_local bfloat16 (524288,) %'dot.14-buffer-2748'[1024i15_0_0_772_0+4096i0.128+i1.1024] = store bfloat16<128 x 1024> TongaSB partitions[1] bfloat16 (4, 128, 1024) %773[i15_0_0_772_0,i0.128,i1.1024] # id=933, src_id=None, , instances=4 # dl = tensor_op_name: _dot.256 | hlo_id: 59 | [[i0.128];[i1.1024]] -> [[i0.128];[i1.1024]] +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 0.109% of tot. time) for bfloat16<128 x 4096> non_local bfloat16 (128, 32, 128) %'convert.57'[i0.128,i2.4+4i3.8,i1.128] = store bfloat16<128 x 4096> TongaSB partitions[0] bfloat16 (128, 8, 512) %'707.2553'[i0.128,i3.8,i1.128+128i2.4] # id=1065, src_id=None, , instances=1 # dl = tensor_op_name: convert.57_pftranspose_707 | hlo_id: 70 | [[i0.128];[i1.128, i2.4, i3.8]] -> [[i0.128];[i1.128, i2.4, i3.8]] +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.012 seconds +2025-08-07T13:53:59Z INFO 48705 [sg0002/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.000 seconds +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.001 seconds +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.000 seconds +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.004 seconds +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (1, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (1, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:53:59Z INFO 48705 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.003 seconds +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (1, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (1, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 48705 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-08-07T13:54:00Z INFO 48705 [sg0002/Tensorizer/OptimizeNKIKernels]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 48705 [sg0002/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.526 seconds +2025-08-07T13:54:00Z INFO 48705 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:54:00Z INFO 48705 [sg0002/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 48705 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.019 seconds +2025-08-07T13:54:00Z INFO 48705 [sg0002/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-08-07T13:54:00Z WARNING 48705 [sg0002/Tensorizer/StaticProfiler]: matmul-based transposes inserted by penguin takes up 79.86 percent of all matmul computation +2025-08-07T13:54:00Z INFO 48705 [sg0002/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 48705 [sg0002/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.013 seconds +2025-08-07T13:54:00Z INFO 48705 [sg0002/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-08-07T13:54:00Z INFO 48705 [sg0002/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 48705 [sg0002/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.106 seconds +2025-08-07T13:54:00Z INFO 48705 [sg0002/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-08-07T13:54:00Z INFO 48705 [sg0002/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 48705 [sg0002/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.013 seconds +2025-08-07T13:54:00Z INFO 48705 [sg0002/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-08-07T13:54:00Z INFO 48705 [sg0002/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 48705 [sg0002/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.085 seconds +2025-08-07T13:54:00Z INFO 48705 [sg0002/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-08-07T13:54:00Z INFO 48705 [sg0002/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 48705 [sg0002/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 48705 [sg0002/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-08-07T13:54:01Z INFO 48705 [sg0002/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 48705 [sg0002/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.462 seconds +2025-08-07T13:54:01Z INFO 48705 [Tensorizer]: BirCodeGen estimate #instances=96499 in sg0002 +2025-08-07T13:54:01Z INFO 48705 [Tensorizer]: IR signature: 0635d3edaf75e9c19039f712ad29cf07e67b796ed4064732e2e600d5c9f5e9ff for nc00/sg0002/TensorizerBIR +2025-08-07T13:54:01Z INFO 48705 [Tensorizer]: Weights total number of bytes: 135176 +2025-08-07T13:54:01Z INFO 48705 [Tensorizer]: Successfully built model. +2025-08-07T13:54:01Z USER 47514 [root/Tensorizer/Tensorizer]: Tensorizer finished after 9.739 seconds +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: End tensorization +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input76 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input0 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input79 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input83 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input82 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input1 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input81 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input80 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input78 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input77 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input4 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input2 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input5 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input86 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input87 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input85 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input84 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input90 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input94 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input93 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input92 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input91 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input89 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input88 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input6 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input2 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input7 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input471 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input472 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input470 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input469 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input474 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input1 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input473 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Network input: input3 +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: wrote bir.json +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: wrote tensor_map.json +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: wrote bir.json +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: wrote tensor_map.json +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: wrote bir.json +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: wrote tensor_map.json +2025-08-07T13:54:01Z INFO 47514 [job.Frontend.0]: Job #0 finished +2025-08-07T13:54:01Z INFO 47514 [pipeline.Pipeline.0]: Finished job job.Frontend.0 +2025-08-07T13:54:01Z INFO 47514 [pipeline.Pipeline.0]: Starting job job.StaticIOTranspose.0 +2025-08-07T13:54:01Z INFO 47514 [pipeline.Pipeline.0]: Finished job job.StaticIOTranspose.0 +2025-08-07T13:54:01Z INFO 47514 [pipeline.Pipeline.0]: Starting job job.WalrusDriver.0 +2025-08-07T13:54:01Z INFO 47514 [job.WalrusDriver.0]: BackendDriver has 3 states with 1 core LNC +2025-08-07T13:54:01Z INFO 47514 [job.WalrusDriver.0]: BackendDriver MT cwd: /home/ubuntu/qwen3/context_encoding_model/_tp0_bk0/neuronxcc-vebk23i5 +2025-08-07T13:54:01Z INFO 47514 [job.BIRLinker.1]: Creating directory sgLnk/sg00 +2025-08-07T13:54:01Z INFO 47514 [job.WalrusDriver.0]: StateId sg00 Dir /home/ubuntu/qwen3/context_encoding_model/_tp0_bk0/neuronxcc-vebk23i5/sg00 +2025-08-07T13:54:01Z INFO 47514 [job.WalrusDriver.0]: StateId sg01 Dir /home/ubuntu/qwen3/context_encoding_model/_tp0_bk0/neuronxcc-vebk23i5/sg01 +2025-08-07T13:54:01Z INFO 47514 [job.WalrusDriver.0]: StateId sg02 Dir /home/ubuntu/qwen3/context_encoding_model/_tp0_bk0/neuronxcc-vebk23i5/sg02 +2025-08-07T13:54:01Z INFO 47514 [job.WalrusDriver.0]: Number of subgraphs to link: 3 +2025-08-07T13:54:01Z INFO 47514 [job.WalrusDriver.0]: lnkState: {"model": ["/home/ubuntu/qwen3/context_encoding_model/_tp0_bk0/model.MODULE_f4171003694760566af4+a9cd68fb.hlo_module.pb"], "tensormap": "tensor_map.json", "bir": "bir.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "state_dir": "/home/ubuntu/qwen3/context_encoding_model/_tp0_bk0/neuronxcc-vebk23i5/sgLnk/sg00", "state_id": "sgLnk"} +2025-08-07T13:54:01Z INFO 47514 [job.WalrusDriver.0]: BackendDriver in_state.num_states 3 with 1 core LNC +2025-08-07T13:54:01Z INFO 47514 [job.WalrusDriver.0]: Executing /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/walrus_driver --optlevel 2 --allocator coloring --verbose 35 --logfile-verbose 20 --logfile /home/ubuntu/qwen3/context_encoding_model/_tp0_bk0/log-neuron-cc.txt -o walrus_bir.out.json --enable-call-graph --enable-mt-backend --link-subgraphs sg00,sg01,sg02 --link-dir sgLnk/sg00 --execute-repetition 1 -i bir.json --min_split_size 10240 --skip_split_vns '' --no_split_dram --split_huge_dram_tensor 1.0 --preprocessing_only --max_tensorizer_distance 64 --pack_same_shape_only --instruction_fetch_latency 511 --max-partitions 1 --policy 3 --auxflag 0 --interleave none --schedule-delayed-latency 1 --postsched-mm-accum-reorder=false --max-load-lower-bound 0.14 --force-prefetch-follow-incoming-order -1 --allreduce-buffer-size 500 --dram-page-size 512 --dram-rotation-size -1 --allreduce-rotation-dis 8 --repeat-load-thres 4 --enable-mm-transpose-remat-optimization=true --save-len-thres 512 --save-dma-cnt-thres 32 --relaxed-order=true --enable-anti-dependence-reduction=false --num-semaphores-per-queue 16 --numcores 1 --act-root-json /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/pwp/pwp_bin_trainium/act_info.json --dve-root-json /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen2/dve_info.json --enable-verifier=true --enable-birsim=false --enable-birsim-sync-only=false --enable-data-race-checker=false --enable-new-backend=true --inject-error=NONE --enable-internal-partitioner --dge-levels scalar_dynamic_offset,vector_dynamic_offsets,io --dynamic-dma-scratch-size-per-partition=16384 --neff-output-filename /home/ubuntu/qwen3/context_encoding_model/_tp0_bk0/model.MODULE_f4171003694760566af4+a9cd68fb.neff +2025-08-07T13:54:01Z INFO 47514 [job.WalrusDriver.0]: Working directory is /home/ubuntu/qwen3/context_encoding_model/_tp0_bk0/neuronxcc-vebk23i5 +2025-08-07T13:54:01Z INFO 47514 [job.WalrusDriver.0]: propagate_exit=True +2025-08-07T13:54:01Z INFO 47514 [job.WalrusDriver.0]: use_logger=False +2025-08-07T13:54:01Z INFO 47514 [job.WalrusDriver.0]: expose_stderr=True +2025-08-07T13:54:01Z INFO 49129 [Logging]: Logging to ../log-neuron-cc.txt at level 'INFO' +2025-08-07T13:54:01Z INFO 49129 [BackendDriver]: max_allowed_parallelism=128 +2025-08-07T13:54:01Z INFO 49129 [BackendDriver]: Loading module from sg00/bir.json +2025-08-07T13:54:01Z INFO 49129 [BackendDriver]: Loading module from sg01/bir.json +2025-08-07T13:54:01Z INFO 49129 [BackendDriver]: Loading module from sg02/bir.json +2025-08-07T13:54:01Z INFO 49129 [BackendDriver]: Backend driver mtBackend: true numModules: 3 Cwd: "/home/ubuntu/qwen3/context_encoding_model/_tp0_bk0/neuronxcc-vebk23i5" +2025-08-07T13:54:01Z INFO 49129 [BackendDriver]: DynamicDMA is enabled +2025-08-07T13:54:01Z INFO 49129 [BackendDriver]: DynamicDMA levels being enabled: io, scalar_dynamic_offset, vector_dynamic_offsets, +2025-08-07T13:54:01Z INFO 49129 [BackendDriver]: Modular flow call graph is enabled +2025-08-07T13:54:01Z INFO 49129 [BackendDriver]: Internal partitioner is enabled +2025-08-07T13:54:01Z USER 49129 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:54:01Z INFO 49129 [BackendPassManager]: Inputs to mod_parallel_pass: modules=3 functions=3 allocs=663 blocks=3 instructions=1178 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:01Z USER 49129 (sg00) [ModuleForkPass]: Running do_nothing +2025-08-07T13:54:01Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=190 blocks=1 instructions=139 Max writers: 2 Max Readers: 10 +2025-08-07T13:54:01Z USER 49129 (sg00) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 67mb, ru_maxrss: 197mb (delta=0mb) +2025-08-07T13:54:01Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 190 memory location(s), 1 block(s), and 139 instruction(s). Max writers: 2 Max Readers: 10 +2025-08-07T13:54:01Z USER 49129 (sg00) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:01Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=190 blocks=1 instructions=139 Max writers: 2 Max Readers: 10 +2025-08-07T13:54:01Z USER 49129 (sg01) [ModuleForkPass]: Running do_nothing +2025-08-07T13:54:01Z USER 49129 (sg02) [ModuleForkPass]: Running do_nothing +2025-08-07T13:54:01Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=135 blocks=1 instructions=75 Max writers: 2 Max Readers: 8 +2025-08-07T13:54:01Z USER 49129 (sg01) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 67mb, ru_maxrss: 197mb (delta=0mb) +2025-08-07T13:54:01Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 135 memory location(s), 1 block(s), and 75 instruction(s). Max writers: 2 Max Readers: 8 +2025-08-07T13:54:01Z USER 49129 (sg01) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:01Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=135 blocks=1 instructions=75 Max writers: 2 Max Readers: 8 +2025-08-07T13:54:01Z WARNING 49129 [birverifier::InstVisitor]: (sg00) Non - output memory location with no reader: {convert.282.1710}@SB<0,0>(1x2)#Internal DebugInfo: +2025-08-07T13:54:01Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=338 blocks=1 instructions=964 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:01Z USER 49129 (sg02) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 67mb, ru_maxrss: 197mb (delta=0mb) +2025-08-07T13:54:01Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 338 memory location(s), 1 block(s), and 964 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:54:01Z USER 49129 (sg02) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:01Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=338 blocks=1 instructions=964 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:01Z USER 49129 (sg00) [ModuleForkPass]: birverifier finished after 0.003 seconds +2025-08-07T13:54:01Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 67mb, ru_maxrss: 197mb (delta=0mb) +2025-08-07T13:54:01Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 190 memory location(s), 1 block(s), and 139 instruction(s). Max writers: 2 Max Readers: 10 +2025-08-07T13:54:01Z USER 49129 (sg01) [ModuleForkPass]: birverifier finished after 0.015 seconds +2025-08-07T13:54:01Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 90mb, ru_maxrss: 197mb (delta=0mb) +2025-08-07T13:54:01Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 135 memory location(s), 1 block(s), and 75 instruction(s). Max writers: 2 Max Readers: 8 +2025-08-07T13:54:01Z USER 49129 (sg02) [ModuleForkPass]: birverifier finished after 0.134 seconds +2025-08-07T13:54:01Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 238mb, ru_maxrss: 238mb (delta=41mb) +2025-08-07T13:54:01Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 338 memory location(s), 1 block(s), and 964 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:54:01Z USER 49129 [ModuleForkPass]: Compilation status: Total modules: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:01Z USER 49129 [BackendPassManager]: mod_parallel_pass finished after 0.136 seconds +2025-08-07T13:54:01Z INFO 49129 [BackendPassManager]: curr_vmrss: 230mb, ru_maxrss: 238mb (delta=41mb) +2025-08-07T13:54:01Z INFO 49129 [BackendPassManager]: Output has 3 module(s), 3 function(s), 663 memory location(s), 3 block(s), and 1178 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:54:01Z USER 49129 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:54:01Z INFO 49129 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=3 functions=3 allocs=663 blocks=3 instructions=1178 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:01Z USER 49129 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:54:01Z INFO 49129 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=190 blocks=1 instructions=139 Max writers: 2 Max Readers: 10 +2025-08-07T13:54:01Z USER 49129 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49129 (sg00) [SubgraphForkPass]: curr_vmrss: 231mb, ru_maxrss: 238mb (delta=0mb) +2025-08-07T13:54:01Z INFO 49129 (sg00) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 190 memory location(s), 1 block(s), and 139 instruction(s). Max writers: 2 Max Readers: 10 +2025-08-07T13:54:01Z USER 49129 (sg02) [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:54:01Z USER 49129 (sg01) [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:54:01Z INFO 49129 (sg01) [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=135 blocks=1 instructions=75 Max writers: 2 Max Readers: 8 +2025-08-07T13:54:01Z USER 49129 (sg01) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49129 (sg01) [SubgraphForkPass]: curr_vmrss: 231mb, ru_maxrss: 238mb (delta=0mb) +2025-08-07T13:54:01Z INFO 49129 (sg01) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 135 memory location(s), 1 block(s), and 75 instruction(s). Max writers: 2 Max Readers: 8 +2025-08-07T13:54:01Z INFO 49129 (sg02) [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=338 blocks=1 instructions=964 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:01Z USER 49129 (sg02) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49129 (sg02) [SubgraphForkPass]: curr_vmrss: 231mb, ru_maxrss: 238mb (delta=0mb) +2025-08-07T13:54:01Z INFO 49129 (sg02) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 338 memory location(s), 1 block(s), and 964 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:54:01Z USER 49129 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:01Z USER 49129 [BackendPassManager]: subgraph_parallel_pass finished after 0.001 seconds +2025-08-07T13:54:01Z INFO 49129 [BackendPassManager]: curr_vmrss: 231mb, ru_maxrss: 238mb (delta=0mb) +2025-08-07T13:54:01Z INFO 49129 [BackendPassManager]: Output has 3 module(s), 3 function(s), 663 memory location(s), 3 block(s), and 1178 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:54:01Z USER 49129 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:54:01Z INFO 49129 [BackendPassManager]: Inputs to mod_parallel_pass: modules=3 functions=3 allocs=663 blocks=3 instructions=1178 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:01Z USER 49129 (sg00) [ModuleForkPass]: Running expand_replication +2025-08-07T13:54:01Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=190 blocks=1 instructions=139 Max writers: 2 Max Readers: 10 +2025-08-07T13:54:01Z INFO 49129 (sg00) [ExpandReplication]: Found 0 replicated matmults +2025-08-07T13:54:01Z USER 49129 (sg00) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 231mb, ru_maxrss: 238mb (delta=0mb) +2025-08-07T13:54:01Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 190 memory location(s), 1 block(s), and 139 instruction(s). Max writers: 2 Max Readers: 10 +2025-08-07T13:54:01Z USER 49129 (sg00) [ModuleForkPass]: Running unroll +2025-08-07T13:54:01Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=190 blocks=1 instructions=139 Max writers: 2 Max Readers: 10 +2025-08-07T13:54:01Z INFO 49129 (sg00) [Unroll]: INFO (Unroll) Start unrolling at Thu Aug 7 13:54:01 2025 +2025-08-07T13:54:01Z USER 49129 (sg01) [ModuleForkPass]: Running expand_replication +2025-08-07T13:54:01Z USER 49129 (sg02) [ModuleForkPass]: Running expand_replication +2025-08-07T13:54:01Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=135 blocks=1 instructions=75 Max writers: 2 Max Readers: 8 +2025-08-07T13:54:01Z INFO 49129 (sg01) [ExpandReplication]: Found 0 replicated matmults +2025-08-07T13:54:01Z USER 49129 (sg01) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 231mb, ru_maxrss: 238mb (delta=0mb) +2025-08-07T13:54:01Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 135 memory location(s), 1 block(s), and 75 instruction(s). Max writers: 2 Max Readers: 8 +2025-08-07T13:54:01Z USER 49129 (sg01) [ModuleForkPass]: Running unroll +2025-08-07T13:54:01Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=135 blocks=1 instructions=75 Max writers: 2 Max Readers: 8 +2025-08-07T13:54:01Z INFO 49129 (sg01) [Unroll]: INFO (Unroll) Start unrolling at Thu Aug 7 13:54:01 2025 +2025-08-07T13:54:01Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=338 blocks=1 instructions=964 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:01Z INFO 49129 (sg02) [ExpandReplication]: Found 0 replicated matmults +2025-08-07T13:54:01Z USER 49129 (sg02) [ModuleForkPass]: expand_replication finished after 0.001 seconds +2025-08-07T13:54:01Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 232mb, ru_maxrss: 238mb (delta=0mb) +2025-08-07T13:54:01Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 338 memory location(s), 1 block(s), and 964 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:54:01Z USER 49129 (sg02) [ModuleForkPass]: Running unroll +2025-08-07T13:54:01Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=338 blocks=1 instructions=964 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:01Z INFO 49129 (sg02) [Unroll]: INFO (Unroll) Start unrolling at Thu Aug 7 13:54:01 2025 +2025-08-07T13:54:01Z INFO 49129 (sg00) [Unroll]: INFO (Unroll) DONE unrolling Thu Aug 7 13:54:01 2025 + +2025-08-07T13:54:01Z INFO 49129 (sg00) [Unroll]: sg0000 Instruction count after Unroll: +2025-08-07T13:54:01Z INFO 49129 (sg00) [Unroll]: Total count: 1466 +2025-08-07T13:54:01Z INFO 49129 (sg00) [Unroll]: Matmult: 930 +2025-08-07T13:54:01Z INFO 49129 (sg00) [Unroll]: GenericCopy: 136 +2025-08-07T13:54:01Z INFO 49129 (sg00) [Unroll]: Load: 112 +2025-08-07T13:54:01Z INFO 49129 (sg00) [Unroll]: TensorTensor: 90 +2025-08-07T13:54:01Z INFO 49129 (sg00) [Unroll]: TensorScalarPtr: 82 +2025-08-07T13:54:01Z INFO 49129 (sg00) [Unroll]: Activation: 53 +2025-08-07T13:54:01Z INFO 49129 (sg00) [Unroll]: Save: 28 +2025-08-07T13:54:01Z INFO 49129 (sg00) [Unroll]: DMACopy: 10 +2025-08-07T13:54:01Z INFO 49129 (sg00) [Unroll]: Memset: 9 +2025-08-07T13:54:01Z INFO 49129 (sg00) [Unroll]: TensorReduce: 8 +2025-08-07T13:54:01Z INFO 49129 (sg00) [Unroll]: TensorScalarAffineSelect: 4 +2025-08-07T13:54:01Z INFO 49129 (sg00) [Unroll]: CollectiveCompute: 2 +2025-08-07T13:54:01Z INFO 49129 (sg00) [Unroll]: Reciprocal: 1 +2025-08-07T13:54:01Z INFO 49129 (sg00) [Unroll]: Iota: 1 +2025-08-07T13:54:01Z INFO 49129 (sg00) [Unroll]: Unrolled DGE count with Dynamic AP: 9 +2025-08-07T13:54:01Z USER 49129 (sg00) [ModuleForkPass]: unroll finished after 0.016 seconds +2025-08-07T13:54:01Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 257mb (delta=19mb) +2025-08-07T13:54:01Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 562 memory location(s), 1 block(s), and 1466 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:01Z INFO 49129 (sg01) [Unroll]: INFO (Unroll) DONE unrolling Thu Aug 7 13:54:01 2025 + +2025-08-07T13:54:01Z INFO 49129 (sg01) [Unroll]: sg0001 Instruction count after Unroll: +2025-08-07T13:54:01Z INFO 49129 (sg01) [Unroll]: Total count: 5137 +2025-08-07T13:54:01Z INFO 49129 (sg01) [Unroll]: Matmult: 4467 +2025-08-07T13:54:01Z INFO 49129 (sg01) [Unroll]: Load: 213 +2025-08-07T13:54:01Z INFO 49129 (sg01) [Unroll]: GenericCopy: 126 +2025-08-07T13:54:01Z INFO 49129 (sg01) [Unroll]: TensorScalarPtr: 102 +2025-08-07T13:54:01Z INFO 49129 (sg01) [Unroll]: TensorTensor: 92 +2025-08-07T13:54:01Z INFO 49129 (sg01) [Unroll]: Activation: 92 +2025-08-07T13:54:01Z INFO 49129 (sg01) [Unroll]: Save: 10 +2025-08-07T13:54:01Z INFO 49129 (sg01) [Unroll]: DMACopy: 10 +2025-08-07T13:54:01Z INFO 49129 (sg01) [Unroll]: Memset: 10 +2025-08-07T13:54:01Z INFO 49129 (sg01) [Unroll]: TensorReduce: 8 +2025-08-07T13:54:01Z INFO 49129 (sg01) [Unroll]: Select: 4 +2025-08-07T13:54:01Z INFO 49129 (sg01) [Unroll]: CollectiveCompute: 2 +2025-08-07T13:54:01Z INFO 49129 (sg01) [Unroll]: Reciprocal: 1 +2025-08-07T13:54:01Z INFO 49129 (sg01) [Unroll]: Unrolled DGE count with Dynamic AP: 8 +2025-08-07T13:54:01Z USER 49129 (sg01) [ModuleForkPass]: unroll finished after 0.052 seconds +2025-08-07T13:54:01Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 292mb (delta=54mb) +2025-08-07T13:54:01Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 755 memory location(s), 1 block(s), and 5137 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg02) [Unroll]: INFO (Unroll) DONE unrolling Thu Aug 7 13:54:01 2025 + +2025-08-07T13:54:02Z INFO 49129 (sg02) [Unroll]: sg0002 Instruction count after Unroll: +2025-08-07T13:54:02Z INFO 49129 (sg02) [Unroll]: Total count: 50754 +2025-08-07T13:54:02Z INFO 49129 (sg02) [Unroll]: Matmult: 42227 +2025-08-07T13:54:02Z INFO 49129 (sg02) [Unroll]: GenericCopy: 6026 +2025-08-07T13:54:02Z INFO 49129 (sg02) [Unroll]: Load: 786 +2025-08-07T13:54:02Z INFO 49129 (sg02) [Unroll]: Save: 614 +2025-08-07T13:54:02Z INFO 49129 (sg02) [Unroll]: Max: 224 +2025-08-07T13:54:02Z INFO 49129 (sg02) [Unroll]: MaxIndex: 224 +2025-08-07T13:54:02Z INFO 49129 (sg02) [Unroll]: MatchReplace: 217 +2025-08-07T13:54:02Z INFO 49129 (sg02) [Unroll]: TensorScalarPtr: 214 +2025-08-07T13:54:02Z INFO 49129 (sg02) [Unroll]: TensorTensor: 81 +2025-08-07T13:54:02Z INFO 49129 (sg02) [Unroll]: Activation: 69 +2025-08-07T13:54:02Z INFO 49129 (sg02) [Unroll]: Gather: 35 +2025-08-07T13:54:02Z INFO 49129 (sg02) [Unroll]: Memset: 12 +2025-08-07T13:54:02Z INFO 49129 (sg02) [Unroll]: TensorReduce: 8 +2025-08-07T13:54:02Z INFO 49129 (sg02) [Unroll]: StreamShuffle: 4 +2025-08-07T13:54:02Z INFO 49129 (sg02) [Unroll]: CollectiveCompute: 3 +2025-08-07T13:54:02Z INFO 49129 (sg02) [Unroll]: Select: 3 +2025-08-07T13:54:02Z INFO 49129 (sg02) [Unroll]: Reciprocal: 3 +2025-08-07T13:54:02Z INFO 49129 (sg02) [Unroll]: Iota: 2 +2025-08-07T13:54:02Z INFO 49129 (sg02) [Unroll]: DMACopy: 2 +2025-08-07T13:54:02Z INFO 49129 (sg02) [Unroll]: Unrolled DGE count with Dynamic AP: 1 +2025-08-07T13:54:02Z USER 49129 (sg02) [ModuleForkPass]: unroll finished after 0.485 seconds +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 518mb, ru_maxrss: 518mb (delta=280mb) +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9713 memory location(s), 1 block(s), and 50754 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z USER 49129 [ModuleForkPass]: Compilation status: Total modules: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:02Z USER 49129 [BackendPassManager]: mod_parallel_pass finished after 0.498 seconds +2025-08-07T13:54:02Z INFO 49129 [BackendPassManager]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=280mb) +2025-08-07T13:54:02Z INFO 49129 [BackendPassManager]: Output has 3 module(s), 3 function(s), 11030 memory location(s), 3 block(s), and 57357 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z USER 49129 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:54:02Z INFO 49129 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=3 functions=3 allocs=11030 blocks=3 instructions=57357 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z USER 49129 (sg00) [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:54:02Z INFO 49129 (sg00) [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=562 blocks=1 instructions=1466 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg01) [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:54:02Z USER 49129 (sg02) [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:54:02Z INFO 49129 (sg01) [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=755 blocks=1 instructions=5137 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49129 (sg02) [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=9713 blocks=1 instructions=50754 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg00) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg00) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg01) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z USER 49129 (sg00) [SubgraphForkPass]: dead_code_elim finished after 0.002 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [SubgraphForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg01) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg01) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z USER 49129 (sg01) [SubgraphForkPass]: dead_code_elim finished after 0.004 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [SubgraphForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 704 memory location(s), 1 block(s), and 5137 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg02) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49129 (sg02) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg02) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg02) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z USER 49129 (sg02) [SubgraphForkPass]: dead_code_elim finished after 0.049 seconds +2025-08-07T13:54:02Z INFO 49129 (sg02) [SubgraphForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg02) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 9683 memory location(s), 1 block(s), and 50754 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z USER 49129 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:02Z USER 49129 [BackendPassManager]: subgraph_parallel_pass finished after 0.051 seconds +2025-08-07T13:54:02Z INFO 49129 [BackendPassManager]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 [BackendPassManager]: Output has 3 module(s), 3 function(s), 10905 memory location(s), 3 block(s), and 57356 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z USER 49129 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:54:02Z INFO 49129 [BackendPassManager]: Inputs to mod_parallel_pass: modules=3 functions=3 allocs=10905 blocks=3 instructions=57356 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:02Z USER 49129 (sg02) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=704 blocks=1 instructions=5137 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=9683 blocks=1 instructions=50754 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: birverifier finished after 0.002 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: birverifier finished after 0.006 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 704 memory location(s), 1 block(s), and 5137 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg02) [ModuleForkPass]: birverifier finished after 0.047 seconds +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9683 memory location(s), 1 block(s), and 50754 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z USER 49129 [ModuleForkPass]: Compilation status: Total modules: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:02Z USER 49129 [BackendPassManager]: mod_parallel_pass finished after 0.049 seconds +2025-08-07T13:54:02Z INFO 49129 [BackendPassManager]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 [BackendPassManager]: Output has 3 module(s), 3 function(s), 10905 memory location(s), 3 block(s), and 57356 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z USER 49129 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:54:02Z INFO 49129 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=3 functions=3 allocs=10905 blocks=3 instructions=57356 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z USER 49129 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:54:02Z INFO 49129 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [SubgraphForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z USER 49129 (sg02) [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:54:02Z USER 49129 (sg01) [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:54:02Z INFO 49129 (sg00) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg01) [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=704 blocks=1 instructions=5137 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [SubgraphForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 704 memory location(s), 1 block(s), and 5137 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg02) [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=9683 blocks=1 instructions=50754 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z USER 49129 (sg02) [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49129 (sg02) [SubgraphForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg02) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 9683 memory location(s), 1 block(s), and 50754 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z USER 49129 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:02Z USER 49129 [BackendPassManager]: subgraph_parallel_pass finished after 0.002 seconds +2025-08-07T13:54:02Z INFO 49129 [BackendPassManager]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 [BackendPassManager]: Output has 3 module(s), 3 function(s), 10905 memory location(s), 3 block(s), and 57356 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z USER 49129 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:54:02Z INFO 49129 [BackendPassManager]: Inputs to mod_parallel_pass: modules=3 functions=3 allocs=10905 blocks=3 instructions=57356 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running instruction_reorder +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running instruction_reorder +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=704 blocks=1 instructions=5137 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg02) [ModuleForkPass]: Running instruction_reorder +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: instruction_reorder finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running psum_legalization +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: psum_legalization finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running legalize_cce_dma +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: legalize_cce_dma finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=9683 blocks=1 instructions=50754 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running pre_opts +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg00) [PreOpts]: Skipped. No pre-opt passes enabled +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running error_injector +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z WARNING 49129 (sg00) [ErrorInjector]: Unrecognized injected error value "0" +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running vn_splitter +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 0 +2025-08-07T13:54:02Z INFO 49129 (sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: instruction_reorder finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 704 memory location(s), 1 block(s), and 5137 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running psum_legalization +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=704 blocks=1 instructions=5137 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: psum_legalization finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 704 memory location(s), 1 block(s), and 5137 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running legalize_cce_dma +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=704 blocks=1 instructions=5137 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-08-07T13:54:02Z INFO 49129 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-08-07T13:54:02Z INFO 49129 (sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0 seconds +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: vn_splitter finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running constant_propagate +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: legalize_cce_dma finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 704 memory location(s), 1 block(s), and 5137 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running pre_opts +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=704 blocks=1 instructions=5137 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg01) [PreOpts]: Skipped. No pre-opt passes enabled +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 704 memory location(s), 1 block(s), and 5137 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running error_injector +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=704 blocks=1 instructions=5137 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z WARNING 49129 (sg01) [ErrorInjector]: Unrecognized injected error value "0" +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 704 memory location(s), 1 block(s), and 5137 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running vn_splitter +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=704 blocks=1 instructions=5137 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg01) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 1 +2025-08-07T13:54:02Z INFO 49129 (sg01) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-08-07T13:54:02Z INFO 49129 (sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49129 (sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-08-07T13:54:02Z INFO 49129 (sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49129 (sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-08-07T13:54:02Z INFO 49129 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-08-07T13:54:02Z INFO 49129 (sg01) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [VNSplitterPass]: INFO (VerticalFusion) Time: 0 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [VNSplitterPass]: INFO (ShrinkDN) Time: 0 seconds +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: vn_splitter finished after 0.002 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 704 memory location(s), 1 block(s), and 5137 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running constant_propagate +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=704 blocks=1 instructions=5137 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: constant_propagate finished after 0.002 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running lower_ac +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: lower_ac finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running input_dma_coalescing +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg01) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: input_dma_coalescing finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running remat_optimization +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg00) [RematOpt]: Removed 0 remat instructions +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: remat_optimization finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running early_peephole_opts +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to early_peephole_opts: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg00) [EarlyPeepholeOpts]: PeepholeOpts enabled? ActivationAccumulate: true +2025-08-07T13:54:02Z INFO 49129 (sg00) [EarlyPeepholeOpts]: Activation Accumulate: 0 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: early_peephole_opts finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running infer_stream_ids +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: infer_stream_ids finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running pre_sched +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 [LayerSpiller]: LayerSpill: Start... +2025-08-07T13:54:02Z INFO 49129 [LayerSpiller]: LayerSpill: Found 1 Splits CCs +2025-08-07T13:54:02Z INFO 49129 [LayerSpiller]: Grouped CCs to 1 clusters. +2025-08-07T13:54:02Z INFO 49129 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-08-07T13:54:02Z INFO 49129 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-08-07T13:54:02Z INFO 49129 [LayerSpiller]: LayerSpill: Done. +2025-08-07T13:54:02Z INFO 49129 (sg00) [PreSched]: Start split live ranges Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg00) [PreSched]: No split opportunities: +2025-08-07T13:54:02Z INFO 49129 (sg00) [PreSched]: End split live ranges Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg00) [PreSched]: Strt remove redundncies Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg00) [PreSched]: remove_redundant_memsets +2025-08-07T13:54:02Z INFO 49129 (sg00) [PreSched]: remove_redundant_memsets: 0 +2025-08-07T13:54:02Z INFO 49129 (sg00) [PreSched]: remove_redundant_loads +2025-08-07T13:54:02Z INFO 49129 (sg00) [PreSched]: remove_redundant_loads: 0 +2025-08-07T13:54:02Z INFO 49129 (sg00) [PreSched]: End remove redundncies Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg00) [PreSched]: Start DCE Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg00) [PreSched]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49129 (sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg00) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg00) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg00) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg00) [PreSched]: End DCE Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg01) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-08-07T13:54:02Z INFO 49129 (sg00) [PreSched]: Start build flow dependencies Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg00) [build_flow_deps]: Start build fdeps. Invocation: 1Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg00) [build_flow_deps]: Allocs: 518 instructions: 1465 +2025-08-07T13:54:02Z INFO 49129 (sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49129 (sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: constant_propagate finished after 0.005 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 704 memory location(s), 1 block(s), and 5137 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running lower_ac +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=704 blocks=1 instructions=5137 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg01) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: lower_ac finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 704 memory location(s), 1 block(s), and 5137 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running input_dma_coalescing +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=704 blocks=1 instructions=5137 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: input_dma_coalescing finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 704 memory location(s), 1 block(s), and 5137 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running remat_optimization +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=704 blocks=1 instructions=5137 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg00) [build_flow_deps]: Build fdeps inserted 3724 edges +2025-08-07T13:54:02Z INFO 49129 (sg00) [build_flow_deps]: Done build fdeps 3724 Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg00) [PreSched]: End build flow dependencies Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg00) [PreSched]: Start remove useless insts Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg00) [PreSched]: remove_useless_insts +2025-08-07T13:54:02Z INFO 49129 (sg00) [PreSched]: remove Useless Instructions: 0 +2025-08-07T13:54:02Z INFO 49129 (sg00) [PreSched]: End remove useless insts Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg00) [PreSched]: Start scratchpad optimization Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg00) [PreSched]: End scratchpad optimization Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg01) [RematOpt]: Removed 0 remat instructions +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: remat_optimization finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 704 memory location(s), 1 block(s), and 5137 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running early_peephole_opts +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to early_peephole_opts: modules=1 functions=1 allocs=704 blocks=1 instructions=5137 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg01) [EarlyPeepholeOpts]: PeepholeOpts enabled? ActivationAccumulate: true +2025-08-07T13:54:02Z INFO 49129 (sg00) [PreSched]: DONE PRE scheduling Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: pre_sched finished after 0.007 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg01) [EarlyPeepholeOpts]: Activation Accumulate: 0 +2025-08-07T13:54:02Z INFO 49129 (sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: early_peephole_opts finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z USER 49129 (sg02) [ModuleForkPass]: instruction_reorder finished after 0.012 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 704 memory location(s), 1 block(s), and 5137 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=704 blocks=1 instructions=5137 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 704 memory location(s), 1 block(s), and 5137 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running infer_stream_ids +2025-08-07T13:54:02Z INFO 49129 (sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=704 blocks=1 instructions=5137 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: infer_stream_ids finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 704 memory location(s), 1 block(s), and 5137 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running pre_sched +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=704 blocks=1 instructions=5137 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg01) [PreSched]: Start PRE scheduling 2 cores: 1 at: Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 [LayerSpiller]: LayerSpill: Start... +2025-08-07T13:54:02Z INFO 49129 (sg00) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg00) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9683 memory location(s), 1 block(s), and 50754 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z INFO 49129 (sg00) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z USER 49129 (sg02) [ModuleForkPass]: Running psum_legalization +2025-08-07T13:54:02Z INFO 49129 [LayerSpiller]: LayerSpill: Found 2 Splits CCs +2025-08-07T13:54:02Z INFO 49129 [LayerSpiller]: Grouped CCs to 2 clusters. +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running dynamic_dma_setup +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 519 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running runtime_memory_reservation +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=519 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 519 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running coloring_allocator_psum +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=519 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=9683 blocks=1 instructions=50754 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z INFO 49129 (sg00) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:02Z INFO 49129 (sg00) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:02Z INFO 49129 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-08-07T13:54:02Z INFO 49129 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-08-07T13:54:02Z INFO 49129 [LayerSpiller]: LayerSpill: Done. +2025-08-07T13:54:02Z INFO 49129 (sg01) [PreSched]: Start split live ranges Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg00) [PSUM_Allocator]: allocating PSUM +2025-08-07T13:54:02Z INFO 49129 (sg01) [PreSched]: No split opportunities: +2025-08-07T13:54:02Z INFO 49129 (sg00) [PSUM_Allocator]: main loop +2025-08-07T13:54:02Z INFO 49129 (sg01) [PreSched]: End split live ranges Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg01) [PreSched]: Strt remove redundncies Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg01) [PreSched]: remove_redundant_memsets +2025-08-07T13:54:02Z INFO 49129 (sg00) [PSUM_Allocator]: renumber locations +2025-08-07T13:54:02Z INFO 49129 (sg00) [PSUM_Allocator]: size = 96 +2025-08-07T13:54:02Z INFO 49129 (sg00) [PSUM_Allocator]: build_no_bitmap start +2025-08-07T13:54:02Z INFO 49129 (sg00) [PSUM_Allocator]: 50% PSUM demand before spilling +2025-08-07T13:54:02Z INFO 49129 (sg00) [PSUM_Allocator]: PSUM high-water mark = 4 tensors +2025-08-07T13:54:02Z INFO 49129 (sg00) [PSUM_Allocator]: found 77 edges +2025-08-07T13:54:02Z INFO 49129 (sg00) [PSUM_Allocator]: mean: 1.60417 +2025-08-07T13:54:02Z INFO 49129 (sg00) [PSUM_Allocator]: median: 2.38206 +2025-08-07T13:54:02Z INFO 49129 (sg00) [PSUM_Allocator]: adjacency vectors require 616 bytes +2025-08-07T13:54:02Z INFO 49129 (sg00) [PSUM_Allocator]: build_no_bitmap done +2025-08-07T13:54:02Z INFO 49129 (sg00) [PSUM_Allocator]: find costs +2025-08-07T13:54:02Z INFO 49129 (sg01) [PreSched]: remove_redundant_memsets: 0 +2025-08-07T13:54:02Z INFO 49129 (sg01) [PreSched]: remove_redundant_loads +2025-08-07T13:54:02Z INFO 49129 (sg01) [PreSched]: remove_redundant_loads: 0 +2025-08-07T13:54:02Z INFO 49129 (sg01) [PreSched]: End remove redundncies Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg01) [PreSched]: Start DCE Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg01) [PreSched]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49129 (sg01) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg01) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg01) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-08-07T13:54:02Z INFO 49129 (sg00) [PSUM_Allocator]: simplify interference graph +2025-08-07T13:54:02Z INFO 49129 (sg00) [PSUM_Allocator]: initialize low and high +2025-08-07T13:54:02Z INFO 49129 (sg00) [PSUM_Allocator]: lo = 96 +2025-08-07T13:54:02Z INFO 49129 (sg00) [PSUM_Allocator]: hi = 0 +2025-08-07T13:54:02Z INFO 49129 (sg00) [PSUM_Allocator]: inf = 0 +2025-08-07T13:54:02Z INFO 49129 (sg00) [PSUM_Allocator]: total = 96 +2025-08-07T13:54:02Z INFO 49129 (sg00) [PSUM_Allocator]: simplify +2025-08-07T13:54:02Z INFO 49129 (sg00) [PSUM_Allocator]: new candidates = 0 +2025-08-07T13:54:02Z INFO 49129 (sg00) [PSUM_Allocator]: select ranges +2025-08-07T13:54:02Z INFO 49129 (sg01) [PreSched]: End DCE Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg00) [PSUM_Allocator]: no more spills +2025-08-07T13:54:02Z INFO 49129 (sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-08-07T13:54:02Z INFO 49129 (sg00) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-08-07T13:54:02Z INFO 49129 (sg00) [PSUM_Allocator]: 50% PSUM utilization after allocation +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: coloring_allocator_psum finished after 0.004 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 519 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running dma_optimization_psum +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=519 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: dma_optimization_psum finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 519 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running address_rotation_psum +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=519 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 19 PSUM Banks +2025-08-07T13:54:02Z INFO 49129 (sg01) [PreSched]: Start build flow dependencies Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg01) [build_flow_deps]: Start build fdeps. Invocation: 2Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 4 PSUM Banks +2025-08-07T13:54:02Z INFO 49129 (sg01) [build_flow_deps]: Allocs: 704 instructions: 5137 +2025-08-07T13:54:02Z USER 49129 (sg02) [ModuleForkPass]: psum_legalization finished after 0.006 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 32 PSUM Banks +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: address_rotation_psum finished after 0.002 seconds +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 519 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running coloring_allocator_sb +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=519 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 76338944 +2025-08-07T13:54:02Z INFO 49129 (sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 6933 bytes +2025-08-07T13:54:02Z INFO 49129 (sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 2703362 +2025-08-07T13:54:02Z INFO 49129 (sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 879 bytes +2025-08-07T13:54:02Z INFO 49129 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 791040 +2025-08-07T13:54:02Z INFO 49129 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 343 bytes +2025-08-07T13:54:02Z INFO 49129 (sg00) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:02Z INFO 49129 (sg00) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9683 memory location(s), 1 block(s), and 50754 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z USER 49129 (sg02) [ModuleForkPass]: Running legalize_cce_dma +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: allocating SB +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: main loop +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: renumber locations +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: size = 394 +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=9683 blocks=1 instructions=50754 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: find partners +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: found 62 accumulation groups +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: largest = _dot-t1112 +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: tensors = 33 +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: requires 40960 bytes/partition +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: expanding partners +2025-08-07T13:54:02Z INFO 49129 []: find first defs for local +2025-08-07T13:54:02Z INFO 49129 []: find first defs for global +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: find loads +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: 1 pin count +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: 89 remat count +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: build interference graph +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: pass 1 int-tree +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: Num intervals 394 Num locations 394 +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: IntervalTree Build Done +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: info.neighbors init Done +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: info.neighbors partners Done +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: IntervalTree readback Done +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: edge: 9861 +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: mean: 50.0558 +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: median: 59.0874 +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: find costs +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: simplify interference graph +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: initialize safe & unsafe +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: safe = 279 +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: unsafe = 100 +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: inf = 14 +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: total = 393 +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: simplify +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 90 #Pinned 0 #Safe 0 minCost 0.00254569 maxCost 0.0530634 locations 394 +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: new candidates = 14 +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: select ranges +2025-08-07T13:54:02Z USER 49129 (sg02) [ModuleForkPass]: legalize_cce_dma finished after 0.007 seconds +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: Total: 393 +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: Spilled: 0.000 (0) +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: Allocated: 1.000 (393) +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: Rover zone: 0.735 (289) +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: Pre-rover zone: 0.122 (48) +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: Post-rover zone: 0.142 (56) +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: Slice zone: 0.000 (0) +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: Blocks nothing: 0.084 (33) +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: Blocks medium: 0.038 (15) +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: Visited until medium blocking (mean): 0.328 +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: Visited until medium blocking (median): 0.353 +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: Visited until medium blocking (p95): 0.434 +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: Blocks tall: 0.878 (345) +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.895 +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: Success +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9683 memory location(s), 1 block(s), and 50754 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z USER 49129 (sg02) [ModuleForkPass]: Running pre_opts +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: SB spills = 0 tensors +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: remats = 0 tensors +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: unpinned = 0 tensors +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: SB score = 0 +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-08-07T13:54:02Z INFO 49129 (sg01) [build_flow_deps]: Build fdeps inserted 15463 edges +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: 16384 bytes/partition (100%) successfully pinned +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: pinning saved approximately 9010 cycles +2025-08-07T13:54:02Z INFO 49129 (sg00) [SB_Allocator]: 0% SB utilization after allocation +2025-08-07T13:54:02Z INFO 49129 (sg01) [build_flow_deps]: Done build fdeps 15463 Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg01) [PreSched]: End build flow dependencies Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg01) [PreSched]: Start remove useless insts Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg01) [PreSched]: remove_useless_insts +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=9683 blocks=1 instructions=50754 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z INFO 49129 (sg02) [PreOpts]: Skipped. No pre-opt passes enabled +2025-08-07T13:54:02Z USER 49129 (sg02) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 76338944 +2025-08-07T13:54:02Z INFO 49129 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 6933 bytes +2025-08-07T13:54:02Z INFO 49129 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 2703362 +2025-08-07T13:54:02Z INFO 49129 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 879 bytes +2025-08-07T13:54:02Z INFO 49129 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 791040 +2025-08-07T13:54:02Z INFO 49129 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 343 bytes +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: coloring_allocator_sb finished after 0.009 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 519 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=519 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9683 memory location(s), 1 block(s), and 50754 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z USER 49129 (sg02) [ModuleForkPass]: Running error_injector +2025-08-07T13:54:02Z INFO 49129 (sg01) [PreSched]: remove Useless Instructions: 0 +2025-08-07T13:54:02Z INFO 49129 (sg01) [PreSched]: End remove useless insts Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg01) [PreSched]: Start scratchpad optimization Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=9683 blocks=1 instructions=50754 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z WARNING 49129 (sg02) [ErrorInjector]: Unrecognized injected error value "0" +2025-08-07T13:54:02Z USER 49129 (sg02) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [PreSched]: End scratchpad optimization Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: address_rotation_sb finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 519 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running dma_optimization_sb +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=519 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9683 memory location(s), 1 block(s), and 50754 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z USER 49129 (sg02) [ModuleForkPass]: Running vn_splitter +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 79042306, 53.1233% input load, 1.43024% output write, 45.4464% spill/reload [sg0000] +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=9683 blocks=1 instructions=50754 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: removed 0 identical load +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:54:02Z INFO 49129 (sg02) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 14 +2025-08-07T13:54:02Z INFO 49129 (sg02) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(4.19899e+07) +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-08-07T13:54:02Z INFO 49129 (sg01) [PreSched]: DONE PRE scheduling Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: pre_sched finished after 0.020 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 704 memory location(s), 1 block(s), and 5137 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=704 blocks=1 instructions=5137 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg01) [TensorCopyElim]: Tensor CP elimination: 0 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: average loaded DMA size 6933 bytes +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: average saved DMA size 879 bytes +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 76338944 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 6933 bytes +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 2703362 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 879 bytes +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 0, 0% out of total dma traffic +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 79042306, 53.1233% input load, 1.43024% output write, 45.4464% spill/reload [sg0000] +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 76338944 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 6933 bytes +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 2703362 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 879 bytes +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 791040 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 343 bytes +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 4870 bytes +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: dma_optimization_sb finished after 0.005 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:02Z INFO 49129 (sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: SB Rotation rotated 3 Sb address +2025-08-07T13:54:02Z INFO 49129 (sg01) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg01) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: SB Rotation rotated 12 Sb address +2025-08-07T13:54:02Z INFO 49129 (sg01) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.003 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 704 memory location(s), 1 block(s), and 5137 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running dynamic_dma_setup +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=704 blocks=1 instructions=5137 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 705 memory location(s), 1 block(s), and 5137 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running runtime_memory_reservation +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=705 blocks=1 instructions=5137 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 705 memory location(s), 1 block(s), and 5137 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running coloring_allocator_psum +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=705 blocks=1 instructions=5137 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg01) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:02Z INFO 49129 (sg01) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: SB Rotation rotated 14 Sb address +2025-08-07T13:54:02Z INFO 49129 (sg01) [PSUM_Allocator]: allocating PSUM +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:02Z INFO 49129 (sg01) [PSUM_Allocator]: main loop +2025-08-07T13:54:02Z INFO 49129 (sg01) [PSUM_Allocator]: renumber locations +2025-08-07T13:54:02Z INFO 49129 (sg01) [PSUM_Allocator]: size = 192 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: SB Rotation rotated 23 Sb address +2025-08-07T13:54:02Z INFO 49129 (sg01) [PSUM_Allocator]: build_no_bitmap start +2025-08-07T13:54:02Z INFO 49129 (sg01) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-08-07T13:54:02Z INFO 49129 (sg01) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-08-07T13:54:02Z INFO 49129 (sg01) [PSUM_Allocator]: found 144 edges +2025-08-07T13:54:02Z INFO 49129 (sg01) [PSUM_Allocator]: mean: 1.5 +2025-08-07T13:54:02Z INFO 49129 (sg01) [PSUM_Allocator]: median: 0.781706 +2025-08-07T13:54:02Z INFO 49129 (sg01) [PSUM_Allocator]: adjacency vectors require 1152 bytes +2025-08-07T13:54:02Z INFO 49129 (sg01) [PSUM_Allocator]: build_no_bitmap done +2025-08-07T13:54:02Z INFO 49129 (sg01) [PSUM_Allocator]: find costs +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: address_rotation_sb finished after 0.005 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running coloring_allocator_dram +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg00) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:02Z INFO 49129 (sg00) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:02Z INFO 49129 (sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-08-07T13:54:02Z INFO 49129 (sg00) [DRAM_Allocator]: reserved space = 670693126 bytes +2025-08-07T13:54:02Z INFO 49129 (sg00) [DRAM_Allocator]: spill space = 3670016 bytes +2025-08-07T13:54:02Z INFO 49129 (sg00) [DRAM_Allocator]: aligned spill space = 3670016 bytes +2025-08-07T13:54:02Z INFO 49129 (sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-08-07T13:54:02Z INFO 49129 (sg00) [DRAM_Allocator]: renumber locations +2025-08-07T13:54:02Z INFO 49129 (sg00) [DRAM_Allocator]: size = 4 +2025-08-07T13:54:02Z INFO 49129 []: find first defs for local +2025-08-07T13:54:02Z INFO 49129 []: find first defs for global +2025-08-07T13:54:02Z INFO 49129 (sg00) [DRAM_Allocator]: Num intervals 4 Num locations 4 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-08-07T13:54:02Z INFO 49129 (sg00) [DRAM_Allocator]: info.neighbors init Done +2025-08-07T13:54:02Z INFO 49129 (sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-08-07T13:54:02Z INFO 49129 (sg00) [DRAM_Allocator]: simplify interference graph +2025-08-07T13:54:02Z INFO 49129 (sg00) [DRAM_Allocator]: initialize low and high +2025-08-07T13:54:02Z INFO 49129 (sg00) [DRAM_Allocator]: lo = 4 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DRAM_Allocator]: hi = 0 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DRAM_Allocator]: total = 4 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DRAM_Allocator]: simplify +2025-08-07T13:54:02Z INFO 49129 (sg00) [DRAM_Allocator]: new candidates = 0 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DRAM_Allocator]: select ranges +2025-08-07T13:54:02Z INFO 49129 (sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DRAM_Allocator]: allreduce_dram_hwm 3670016 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DRAM_Allocator]: Real CC buffer size 3670016 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DRAM_Allocator]: DRAM hwm after allocation: 3670016 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DRAM_Allocator]: DRAM allocation successful +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: coloring_allocator_dram finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running address_rotation_dram +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: DRAM hwm before rotation 3670016 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: allreduce hwm 3670016 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: Real CC buffer size 3670016 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: DRAM hwm after rotation 3670016 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: address_rotation_dram finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running tensorcopy_accel +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-08-07T13:54:02Z INFO 49129 (sg00) [TensorCopyAccel::Impl]: Accelerated 0 out of 144 tensorcopy in Function: sg0000 average acceleration factor: -nan +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running peephole_opts +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg00) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: peephole_opts finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running lower_kernel +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg00) [LowerKernel]: Started running LowerKernel +2025-08-07T13:54:02Z INFO 49129 (sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 1465, number of allocs: 518 +2025-08-07T13:54:02Z INFO 49129 (sg00) [LowerKernel]: Scan BKs time (s): 6.3e-05 +2025-08-07T13:54:02Z INFO 49129 (sg00) [LowerKernel]: Lower BKs time (s): 1.1e-05 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: lower_kernel finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: birverifier finished after 0.002 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running dynamic_dma_scan +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: dynamic_dma_scan finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running build_fdeps +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg00) [build_flow_deps]: Start build fdeps. Invocation: 3Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg00) [build_flow_deps]: Allocs: 518 instructions: 1465 +2025-08-07T13:54:02Z INFO 49129 (sg00) [build_flow_deps]: Build fdeps inserted 3724 edges +2025-08-07T13:54:02Z INFO 49129 (sg00) [build_flow_deps]: Done build fdeps 3724 Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: build_fdeps finished after 0.003 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running remove_redundancies +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg00) [RemoveRedundancies]: remove_clobbered_writes +2025-08-07T13:54:02Z INFO 49129 (sg00) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-08-07T13:54:02Z INFO 49129 (sg00) [RemoveRedundancies]: remove_useless_insts +2025-08-07T13:54:02Z INFO 49129 (sg00) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: remove_redundancies finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg00) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:02Z INFO 49129 (sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:54:02Z INFO 49129 (sg00) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:02Z INFO 49129 (sg01) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-08-07T13:54:02Z INFO 49129 (sg01) [PSUM_Allocator]: simplify interference graph +2025-08-07T13:54:02Z INFO 49129 (sg01) [PSUM_Allocator]: initialize low and high +2025-08-07T13:54:02Z INFO 49129 (sg01) [PSUM_Allocator]: lo = 192 +2025-08-07T13:54:02Z INFO 49129 (sg01) [PSUM_Allocator]: hi = 0 +2025-08-07T13:54:02Z INFO 49129 (sg01) [PSUM_Allocator]: inf = 0 +2025-08-07T13:54:02Z INFO 49129 (sg01) [PSUM_Allocator]: total = 192 +2025-08-07T13:54:02Z INFO 49129 (sg01) [PSUM_Allocator]: simplify +2025-08-07T13:54:02Z INFO 49129 (sg01) [PSUM_Allocator]: new candidates = 0 +2025-08-07T13:54:02Z INFO 49129 (sg01) [PSUM_Allocator]: select ranges +2025-08-07T13:54:02Z INFO 49129 (sg01) [PSUM_Allocator]: no more spills +2025-08-07T13:54:02Z INFO 49129 (sg01) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-08-07T13:54:02Z INFO 49129 (sg01) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-08-07T13:54:02Z INFO 49129 (sg01) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: coloring_allocator_psum finished after 0.019 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 705 memory location(s), 1 block(s), and 5137 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running dma_optimization_psum +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=705 blocks=1 instructions=5137 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: dma_optimization_psum finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.009 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 705 memory location(s), 1 block(s), and 5137 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running address_rotation_psum +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=705 blocks=1 instructions=5137 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg02) [ShrinkDN]: INFO (ShrinkDN): Shrunk 2 nodes. Total savings 14336 bytes/partition +2025-08-07T13:54:02Z INFO 49129 (sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-08-07T13:54:02Z INFO 49129 (sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running post_sched +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 [post_scheduler]: Start PosT ScheD 3 sunda Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 19 PSUM Banks +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 4 PSUM Banks +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 13 PSUM Banks +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: address_rotation_psum finished after 0.011 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 705 memory location(s), 1 block(s), and 5137 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running coloring_allocator_sb +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=705 blocks=1 instructions=5137 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 195236352 +2025-08-07T13:54:02Z INFO 49129 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 7160 bytes +2025-08-07T13:54:02Z INFO 49129 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 3145730 +2025-08-07T13:54:02Z INFO 49129 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 2728 bytes +2025-08-07T13:54:02Z INFO 49129 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 266240 +2025-08-07T13:54:02Z INFO 49129 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-08-07T13:54:02Z INFO 49129 (sg01) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:02Z INFO 49129 (sg01) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: allocating SB +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: main loop +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: renumber locations +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: size = 480 +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: find partners +2025-08-07T13:54:02Z INFO 49129 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-08-07T13:54:02Z INFO 49129 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: found 180 accumulation groups +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: largest = _dot.6-t1000_i5 +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: tensors = 50 +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: requires 61440 bytes/partition +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: expanding partners +2025-08-07T13:54:02Z INFO 49129 (sg02) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-08-07T13:54:02Z INFO 49129 (sg02) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.011 seconds +2025-08-07T13:54:02Z INFO 49129 (sg02) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.015 seconds +2025-08-07T13:54:02Z USER 49129 (sg02) [ModuleForkPass]: vn_splitter finished after 0.041 seconds +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9683 memory location(s), 1 block(s), and 50754 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z USER 49129 (sg02) [ModuleForkPass]: Running constant_propagate +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=9683 blocks=1 instructions=50754 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z INFO 49129 []: find first defs for local +2025-08-07T13:54:02Z INFO 49129 []: find first defs for global +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: find loads +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: 1 pin count +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: 127 remat count +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: build interference graph +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: pass 1 int-tree +2025-08-07T13:54:02Z INFO 49129 [post_scheduler]: Time-aware hwm post-sched +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: Num intervals 480 Num locations 480 +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: IntervalTree Build Done +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: info.neighbors init Done +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: info.neighbors partners Done +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: IntervalTree readback Done +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: edge: 18048 +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: mean: 75.2 +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: median: 69.4763 +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: find costs +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: simplify interference graph +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: initialize safe & unsafe +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: safe = 209 +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: unsafe = 152 +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: inf = 118 +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: total = 479 +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: simplify +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 118 #Pinned 0 #Safe 0 minCost 0.00361083 maxCost 0.0825967 locations 480 +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: new candidates = 95 +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: select ranges +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: Total: 479 +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: Spilled: 0.000 (0) +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: Allocated: 1.000 (479) +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: Rover zone: 0.733 (351) +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: Pre-rover zone: 0.006 (3) +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: Post-rover zone: 0.261 (125) +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: Blocks tall: 1.000 (479) +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: Visited until tall blocking (mean): 0.998 +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: Success +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: SB spills = 0 tensors +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: remats = 0 tensors +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: unpinned = 0 tensors +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: SB score = 0 +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: 16384 bytes/partition (100%) successfully pinned +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: pinning saved approximately 9010 cycles +2025-08-07T13:54:02Z INFO 49129 (sg01) [SB_Allocator]: 0% SB utilization after allocation +2025-08-07T13:54:02Z INFO 49129 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 195236352 +2025-08-07T13:54:02Z INFO 49129 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 7160 bytes +2025-08-07T13:54:02Z INFO 49129 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 3145730 +2025-08-07T13:54:02Z INFO 49129 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2728 bytes +2025-08-07T13:54:02Z INFO 49129 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 266240 +2025-08-07T13:54:02Z INFO 49129 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: coloring_allocator_sb finished after 0.013 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 366mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 705 memory location(s), 1 block(s), and 5137 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=705 blocks=1 instructions=5137 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg02) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: address_rotation_sb finished after 0.002 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 366mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 705 memory location(s), 1 block(s), and 5137 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running dma_optimization_sb +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=705 blocks=1 instructions=5137 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 198382082, 97.3572% input load, 0.528565% output write, 2.11426% spill/reload [sg0001] +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: removed 0 identical load +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:54:02Z INFO 49129 [post_scheduler]: Time-aware simulation time: 417023 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: sub-graph will get execute 35 times +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 65536, 0.0330352% out of total dma traffic(1.93139e+08) +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-08-07T13:54:02Z INFO 49129 [post_scheduler]: Done PosT ScheD Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: post_sched finished after 0.029 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 366mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running expand_scheduling_units +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: expand_scheduling_units finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 366mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: average loaded DMA size 7226 bytes +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: average saved DMA size 2728 bytes +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 195170816 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 7226 bytes +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 40 PSUM Banks +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 3145730 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 2728 bytes +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 27 PSUM Banks +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 65536, 0.0330352% out of total dma traffic +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 198316546, 97.3563% input load, 0.52874% output write, 2.11495% spill/reload [sg0001] +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 195170816 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 7226 bytes +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 3145730 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 2728 bytes +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 19 PSUM Banks +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 266240 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 130 bytes +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 6573 bytes +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: dma_optimization_sb finished after 0.012 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 366mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 702 memory location(s), 1 block(s), and 5135 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=702 blocks=1 instructions=5135 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: SB Rotation rotated 3 Sb address +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: SB Rotation rotated 4 Sb address +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: SB Rotation rotated 20 Sb address +2025-08-07T13:54:02Z INFO 49129 (sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: SB Rotation rotated 71 Sb address +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: SB Rotation rotated 44 Sb address +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-08-07T13:54:02Z INFO 49129 (sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: address_rotation_sb finished after 0.013 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: SB Rotation rotated 11 Sb address +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 366mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg00) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:02Z INFO 49129 (sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:54:02Z INFO 49129 (sg00) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: SB Rotation rotated 9 Sb address +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.006 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 367mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg00) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:02Z INFO 49129 (sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-08-07T13:54:02Z INFO 49129 (sg00) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: address_rotation_sb finished after 0.013 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 367mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 702 memory location(s), 1 block(s), and 5135 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running coloring_allocator_dram +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=702 blocks=1 instructions=5135 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg01) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:02Z INFO 49129 (sg01) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 367mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running dep_opt +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg00) [build_flow_deps]: Start build fdeps. Invocation: 4Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg00) [build_flow_deps]: Allocs: 518 instructions: 1465 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-08-07T13:54:02Z INFO 49129 (sg01) [DRAM_Allocator]: reserved space = 201462280 bytes +2025-08-07T13:54:02Z INFO 49129 (sg01) [DRAM_Allocator]: spill space = 5242880 bytes +2025-08-07T13:54:02Z INFO 49129 (sg01) [DRAM_Allocator]: aligned spill space = 5242880 bytes +2025-08-07T13:54:02Z INFO 49129 (sg01) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-08-07T13:54:02Z INFO 49129 (sg01) [DRAM_Allocator]: renumber locations +2025-08-07T13:54:02Z INFO 49129 (sg01) [DRAM_Allocator]: size = 5 +2025-08-07T13:54:02Z INFO 49129 []: find first defs for local +2025-08-07T13:54:02Z INFO 49129 []: find first defs for global +2025-08-07T13:54:02Z INFO 49129 (sg00) [build_flow_deps]: Build fdeps inserted 3649 edges +2025-08-07T13:54:02Z INFO 49129 (sg00) [build_flow_deps]: Done build fdeps 3649 Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: dep_opt finished after 0.003 seconds +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 367mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: Running report_stats +2025-08-07T13:54:02Z INFO 49129 (sg01) [DRAM_Allocator]: Num intervals 5 Num locations 5 +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DRAM_Allocator]: IntervalTree Build Done +2025-08-07T13:54:02Z INFO 49129 (sg01) [DRAM_Allocator]: info.neighbors init Done +2025-08-07T13:54:02Z INFO 49129 (sg01) [DRAM_Allocator]: IntervalTree readback Done +2025-08-07T13:54:02Z INFO 49129 (sg01) [DRAM_Allocator]: simplify interference graph +2025-08-07T13:54:02Z INFO 49129 (sg01) [DRAM_Allocator]: initialize low and high +2025-08-07T13:54:02Z INFO 49129 (sg01) [DRAM_Allocator]: lo = 5 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DRAM_Allocator]: hi = 0 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DRAM_Allocator]: total = 5 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DRAM_Allocator]: simplify +2025-08-07T13:54:02Z INFO 49129 (sg01) [DRAM_Allocator]: new candidates = 0 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DRAM_Allocator]: select ranges +2025-08-07T13:54:02Z INFO 49129 (sg01) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DRAM_Allocator]: allreduce_dram_hwm 4194304 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DRAM_Allocator]: Real CC buffer size 4194304 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DRAM_Allocator]: DRAM hwm after allocation: 5242880 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DRAM_Allocator]: DRAM allocation successful +2025-08-07T13:54:02Z INFO 49129 (sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌──────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ ExternalInput -> Internal │ 1 │ 622329856 │ +│ DMACopy │ Internal -> ExternalOutput │ 8 │ 8388608 │ +│ DMACopy │ Internal -> Output │ 1 │ 2097152 │ +│ Load │ Const -> Internal │ 3 │ 37120 │ +│ Load │ ExternalInput -> Internal │ 45 │ 41952768 │ +│ Load │ Internal │ 32 │ 1048576 │ +│ Load (Spill) │ Internal │ 32 │ 33300480 │ +│ Save │ Internal │ 20 │ 1572864 │ +│ Save │ Internal -> Output │ 8 │ 1130498 │ +└──────────────┴────────────────────────────┴───────┴───────────┘ + +2025-08-07T13:54:02Z INFO 49129 (sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 3 │ +│ 4 │ 1 │ +│ 32 │ 1 │ +│ 64 │ 1 │ +│ 128 │ 1 │ +│ 256 │ 52 │ +│ 512 │ 1 │ +│ 2048 │ 4 │ +│ 4096 │ 1 │ +│ 8192 │ 44 │ +│ 32520 │ 32 │ +│ 262144 │ 8 │ +│ 1048576 │ 2 │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:02Z INFO 49129 (sg00) [ReportStats]: MM Stats: #MatMults 930 #MatMult-Transposes 72 +2025-08-07T13:54:02Z INFO 49129 (sg00) [ReportStats]: IO Tensor size combined: 668476932 +2025-08-07T13:54:02Z INFO 49129 (sg00) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input76 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input77 │ ExternalInput │ bfloat16 │ 16777216 │ +│ input83 │ ExternalInput │ bfloat16 │ 16777216 │ +│ input78 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input81 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input5 │ ExternalInput │ bfloat16 │ 1048576 │ +│ input4 │ ExternalInput │ bfloat16 │ 1048576 │ +│ output1 │ ExternalOutput │ bfloat16 │ 1048576 │ +│ output2 │ ExternalOutput │ bfloat16 │ 1048576 │ +│ input79 │ ExternalInput │ bfloat16 │ 8192 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: coloring_allocator_dram finished after 0.004 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 367mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg00) [ReportStats]: Large (Internal) Tensor Statistics: +┌───────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├───────────────────────┼──────────┼──────────┼──────────────┤ +│ input78_local_1059 │ Internal │ bfloat16 │ 4194304 │ +│ input77_local_1070_i6 │ Internal │ bfloat16 │ 2097152 │ +│ input77_local_1070_i4 │ Internal │ bfloat16 │ 2097152 │ +│ input77_local_1070_i3 │ Internal │ bfloat16 │ 2097152 │ +│ input77_local_1070_i5 │ Internal │ bfloat16 │ 2097152 │ +│ input77_local_1070_i1 │ Internal │ bfloat16 │ 2097152 │ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ input77_local_1070_i7 │ Internal │ bfloat16 │ 2097152 │ +│ input77_local_1070_i2 │ Internal │ bfloat16 │ 2097152 │ +│ input77_local_1070_i0 │ Internal │ bfloat16 │ 2097152 │ +└───────────────────────┴──────────┴──────────┴──────────────┘ + +2025-08-07T13:54:02Z USER 49129 (sg00) [ModuleForkPass]: report_stats finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 702 memory location(s), 1 block(s), and 5135 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running address_rotation_dram +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 367mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=702 blocks=1 instructions=5135 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: Runtime page size at 512MB +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: DRAM hwm before rotation 5242880 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: allreduce hwm 4194304 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: Real CC buffer size 4194304 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: DRAM hwm after rotation 5242880 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: address_rotation_dram finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 367mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 702 memory location(s), 1 block(s), and 5135 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running tensorcopy_accel +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=702 blocks=1 instructions=5135 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg01) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-08-07T13:54:02Z INFO 49129 (sg01) [TensorCopyAccel::Impl]: Accelerated 4 out of 136 tensorcopy in Function: sg0001 average acceleration factor: 1 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: tensorcopy_accel finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 367mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 702 memory location(s), 1 block(s), and 5135 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running peephole_opts +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=702 blocks=1 instructions=5135 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg01) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: peephole_opts finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 367mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 702 memory location(s), 1 block(s), and 5139 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running lower_kernel +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=702 blocks=1 instructions=5139 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg01) [LowerKernel]: Started running LowerKernel +2025-08-07T13:54:02Z INFO 49129 (sg01) [LowerKernel]: Start of kernel lowering pass, number of insts: 5139, number of allocs: 702 +2025-08-07T13:54:02Z INFO 49129 (sg01) [LowerKernel]: Scan BKs time (s): 0.001154 +2025-08-07T13:54:02Z INFO 49129 (sg01) [LowerKernel]: Lower BKs time (s): 5e-06 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: lower_kernel finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 367mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 702 memory location(s), 1 block(s), and 5139 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running lower_nki_kernel +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=702 blocks=1 instructions=5139 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 367mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 702 memory location(s), 1 block(s), and 5139 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=702 blocks=1 instructions=5139 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 367mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 702 memory location(s), 1 block(s), and 5139 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=702 blocks=1 instructions=5139 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: birverifier finished after 0.003 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 367mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 702 memory location(s), 1 block(s), and 5139 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running dynamic_dma_scan +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=702 blocks=1 instructions=5139 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: dynamic_dma_scan finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 367mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 702 memory location(s), 1 block(s), and 5139 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running build_fdeps +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=702 blocks=1 instructions=5139 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg01) [build_flow_deps]: Start build fdeps. Invocation: 5Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg01) [build_flow_deps]: Allocs: 702 instructions: 5139 +2025-08-07T13:54:02Z INFO 49129 (sg01) [build_flow_deps]: Build fdeps inserted 15483 edges +2025-08-07T13:54:02Z INFO 49129 (sg01) [build_flow_deps]: Done build fdeps 15483 Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: build_fdeps finished after 0.009 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 367mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 702 memory location(s), 1 block(s), and 5139 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running remove_redundancies +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=702 blocks=1 instructions=5139 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg01) [RemoveRedundancies]: remove_clobbered_writes +2025-08-07T13:54:02Z INFO 49129 (sg01) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-08-07T13:54:02Z INFO 49129 (sg01) [RemoveRedundancies]: remove_useless_insts +2025-08-07T13:54:02Z INFO 49129 (sg01) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: remove_redundancies finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 367mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 702 memory location(s), 1 block(s), and 5139 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=702 blocks=1 instructions=5139 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg01) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:02Z INFO 49129 (sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:54:02Z INFO 49129 (sg01) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:02Z INFO 49129 (sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg02) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.033 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 368mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 702 memory location(s), 1 block(s), and 5139 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=702 blocks=1 instructions=5139 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg01) [TensorCopyElim]: Tensor CP elimination: 0 +2025-08-07T13:54:02Z INFO 49129 (sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.003 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 369mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 702 memory location(s), 1 block(s), and 5139 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=702 blocks=1 instructions=5139 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 369mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 702 memory location(s), 1 block(s), and 5139 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running post_sched +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=702 blocks=1 instructions=5139 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 [post_scheduler]: Start PosT ScheD 3 sunda Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49129 (sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z USER 49129 (sg02) [ModuleForkPass]: constant_propagate finished after 0.147 seconds +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 370mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9683 memory location(s), 1 block(s), and 50754 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z USER 49129 (sg02) [ModuleForkPass]: Running lower_ac +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=9683 blocks=1 instructions=50754 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z INFO 49129 [post_scheduler]: Time-aware hwm post-sched +2025-08-07T13:54:02Z INFO 49129 (sg02) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-08-07T13:54:02Z USER 49129 (sg02) [ModuleForkPass]: lower_ac finished after 0.009 seconds +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 371mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9683 memory location(s), 1 block(s), and 50754 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z USER 49129 (sg02) [ModuleForkPass]: Running input_dma_coalescing +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=9683 blocks=1 instructions=50754 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z INFO 49129 (sg02) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-08-07T13:54:02Z USER 49129 (sg02) [ModuleForkPass]: input_dma_coalescing finished after 0.019 seconds +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 372mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9683 memory location(s), 1 block(s), and 50754 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z USER 49129 (sg02) [ModuleForkPass]: Running remat_optimization +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=9683 blocks=1 instructions=50754 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z INFO 49129 [post_scheduler]: Time-aware simulation time: 33366410 +2025-08-07T13:54:02Z INFO 49129 [post_scheduler]: Done PosT ScheD Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: post_sched finished after 0.091 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 372mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 702 memory location(s), 1 block(s), and 5139 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running expand_scheduling_units +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=702 blocks=1 instructions=5139 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: expand_scheduling_units finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 372mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 702 memory location(s), 1 block(s), and 5139 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=702 blocks=1 instructions=5139 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 43 PSUM Banks +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 16 PSUM Banks +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 2 PSUM Banks +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: SB Rotation rotated 27 Sb address +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: SB Rotation rotated 4 Sb address +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: SB Rotation rotated 7 Sb address +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:02Z INFO 49129 (sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: address_rotation_sb finished after 0.037 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 372mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 702 memory location(s), 1 block(s), and 5139 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=702 blocks=1 instructions=5139 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg01) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:02Z INFO 49129 (sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:54:02Z INFO 49129 (sg01) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.021 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 373mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 702 memory location(s), 1 block(s), and 5139 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=702 blocks=1 instructions=5139 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg01) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:02Z INFO 49129 (sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-08-07T13:54:02Z INFO 49129 (sg01) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.002 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 373mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 702 memory location(s), 1 block(s), and 5139 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running dep_opt +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=702 blocks=1 instructions=5139 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg01) [build_flow_deps]: Start build fdeps. Invocation: 6Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg01) [build_flow_deps]: Allocs: 702 instructions: 5139 +2025-08-07T13:54:02Z INFO 49129 (sg01) [build_flow_deps]: Build fdeps inserted 15350 edges +2025-08-07T13:54:02Z INFO 49129 (sg01) [build_flow_deps]: Done build fdeps 15350 Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: dep_opt finished after 0.014 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 373mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 702 memory location(s), 1 block(s), and 5139 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: Running report_stats +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=702 blocks=1 instructions=5139 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg01) [ReportStats]: Data Movement Statistics: sg0001 +┌─────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 3145728 │ +│ DMACopy │ Internal -> ExternalOutput │ 8 │ 8388608 │ +│ DMACopy │ Internal -> Output │ 1 │ 2097152 │ +│ Load │ Const -> Internal │ 2 │ 36864 │ +│ Load │ ExternalInput -> Internal │ 204 │ 192954880 │ +│ Load │ Input -> Internal │ 3 │ 81920 │ +│ Load │ Internal │ 2 │ 2097152 │ +│ Save │ Internal │ 8 │ 2097152 │ +│ Save │ Internal -> Output │ 2 │ 1048578 │ +└─────────────┴────────────────────────────┴───────┴───────────┘ + +2025-08-07T13:54:02Z INFO 49129 (sg01) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 3 │ +│ 32 │ 1 │ +│ 64 │ 2 │ +│ 128 │ 1 │ +│ 256 │ 3 │ +│ 2048 │ 8 │ +│ 6144 │ 64 │ +│ 8192 │ 139 │ +│ 262144 │ 8 │ +│ 1048576 │ 5 │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:02Z INFO 49129 (sg01) [ReportStats]: MM Stats: #MatMults 4467 #MatMult-Transposes 122 +2025-08-07T13:54:02Z INFO 49129 (sg01) [ReportStats]: IO Tensor size combined: 197149188 +2025-08-07T13:54:02Z INFO 49129 (sg01) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input87 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input84 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input85 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input88 │ ExternalInput │ bfloat16 │ 16777216 │ +│ input94 │ ExternalInput │ bfloat16 │ 16777216 │ +│ input92 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input89 │ ExternalInput │ bfloat16 │ 4194304 │ +│ output4 │ ExternalOutput │ bfloat16 │ 1048576 │ +│ input7 │ ExternalInput │ bfloat16 │ 1048576 │ +│ input6 │ ExternalInput │ bfloat16 │ 1048576 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-08-07T13:54:02Z INFO 49129 (sg01) [ReportStats]: Large (Internal) Tensor Statistics: +┌──────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├──────────────────────┼──────────┼──────────┼──────────────┤ +│ input89_local_952 │ Internal │ bfloat16 │ 4194304 │ +│ input84_local_886_i6 │ Internal │ bfloat16 │ 3145728 │ +│ input84_local_886_i3 │ Internal │ bfloat16 │ 3145728 │ +│ input84_local_886_i2 │ Internal │ bfloat16 │ 3145728 │ +│ input84_local_886_i4 │ Internal │ bfloat16 │ 3145728 │ +│ input84_local_886_i5 │ Internal │ bfloat16 │ 3145728 │ +│ input84_local_886_i8 │ Internal │ bfloat16 │ 3145728 │ +│ input84_local_886_i7 │ Internal │ bfloat16 │ 3145728 │ +│ input84_local_886_i1 │ Internal │ bfloat16 │ 3145728 │ +│ input84_local_886_i0 │ Internal │ bfloat16 │ 3145728 │ +└──────────────────────┴──────────┴──────────┴──────────────┘ + +2025-08-07T13:54:02Z USER 49129 (sg01) [ModuleForkPass]: report_stats finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 373mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 702 memory location(s), 1 block(s), and 5139 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:02Z INFO 49129 (sg02) [RematOpt]: Removed 0 remat instructions +2025-08-07T13:54:02Z USER 49129 (sg02) [ModuleForkPass]: remat_optimization finished after 0.089 seconds +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 373mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9683 memory location(s), 1 block(s), and 50754 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z USER 49129 (sg02) [ModuleForkPass]: Running early_peephole_opts +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to early_peephole_opts: modules=1 functions=1 allocs=9683 blocks=1 instructions=50754 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z INFO 49129 (sg02) [EarlyPeepholeOpts]: PeepholeOpts enabled? ActivationAccumulate: true +2025-08-07T13:54:02Z INFO 49129 (sg02) [EarlyPeepholeOpts]: Activation Accumulate: 0 +2025-08-07T13:54:02Z USER 49129 (sg02) [ModuleForkPass]: early_peephole_opts finished after 0.023 seconds +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 373mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9683 memory location(s), 1 block(s), and 50754 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z USER 49129 (sg02) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=9683 blocks=1 instructions=50754 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z USER 49129 (sg02) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.007 seconds +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 373mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9683 memory location(s), 1 block(s), and 50754 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z USER 49129 (sg02) [ModuleForkPass]: Running infer_stream_ids +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=9683 blocks=1 instructions=50754 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z USER 49129 (sg02) [ModuleForkPass]: infer_stream_ids finished after 0.006 seconds +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 373mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9683 memory location(s), 1 block(s), and 50754 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z USER 49129 (sg02) [ModuleForkPass]: Running pre_sched +2025-08-07T13:54:02Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=9683 blocks=1 instructions=50754 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:02Z INFO 49129 (sg02) [PreSched]: Start PRE scheduling 2 cores: 1 at: Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 [LayerSpiller]: LayerSpill: Start... +2025-08-07T13:54:02Z INFO 49129 [LayerSpiller]: LayerSpill: Found 1 Splits CCs +2025-08-07T13:54:02Z INFO 49129 [LayerSpiller]: Grouped CCs to 1 clusters. +2025-08-07T13:54:02Z INFO 49129 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-08-07T13:54:02Z INFO 49129 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-08-07T13:54:02Z INFO 49129 [LayerSpiller]: LayerSpill: Done. +2025-08-07T13:54:02Z INFO 49129 (sg02) [PreSched]: Start split live ranges Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg02) [PreSched]: Num_Splits: 1 +2025-08-07T13:54:02Z INFO 49129 (sg02) [PreSched]: End split live ranges Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg02) [PreSched]: Strt remove redundncies Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg02) [PreSched]: remove_redundant_memsets +2025-08-07T13:54:02Z INFO 49129 (sg02) [PreSched]: remove_redundant_memsets: 0 +2025-08-07T13:54:02Z INFO 49129 (sg02) [PreSched]: remove_redundant_loads +2025-08-07T13:54:02Z INFO 49129 (sg02) [PreSched]: remove_redundant_loads: 0 +2025-08-07T13:54:02Z INFO 49129 (sg02) [PreSched]: End remove redundncies Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg02) [PreSched]: Start DCE Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg02) [PreSched]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49129 (sg02) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg02) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg02) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49129 (sg02) [PreSched]: End DCE Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg02) [PreSched]: Start build flow dependencies Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg02) [build_flow_deps]: Start build fdeps. Invocation: 7Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49129 (sg02) [build_flow_deps]: Allocs: 9685 instructions: 50756 +2025-08-07T13:54:03Z INFO 49129 (sg02) [build_flow_deps]: Build fdeps inserted 177894 edges +2025-08-07T13:54:03Z INFO 49129 (sg02) [build_flow_deps]: Done build fdeps 177894 Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49129 (sg02) [PreSched]: End build flow dependencies Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49129 (sg02) [PreSched]: Start remove useless insts Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49129 (sg02) [PreSched]: remove_useless_insts +2025-08-07T13:54:03Z INFO 49129 (sg02) [PreSched]: remove Useless Instructions: 0 +2025-08-07T13:54:03Z INFO 49129 (sg02) [PreSched]: End remove useless insts Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49129 (sg02) [PreSched]: Start scratchpad optimization Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49129 (sg02) [PreSched]: End scratchpad optimization Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49129 (sg02) [PreSched]: DONE PRE scheduling Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z USER 49129 (sg02) [ModuleForkPass]: pre_sched finished after 0.505 seconds +2025-08-07T13:54:03Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 381mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9685 memory location(s), 1 block(s), and 50756 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:03Z USER 49129 (sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:54:03Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=9685 blocks=1 instructions=50756 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:03Z INFO 49129 (sg02) [TensorCopyElim]: Tensor CP elimination: 1 +2025-08-07T13:54:03Z INFO 49129 (sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:03Z INFO 49129 (sg02) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49129 (sg02) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49129 (sg02) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z USER 49129 (sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.120 seconds +2025-08-07T13:54:03Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 381mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9684 memory location(s), 1 block(s), and 50755 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:03Z USER 49129 (sg02) [ModuleForkPass]: Running dynamic_dma_setup +2025-08-07T13:54:03Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=9684 blocks=1 instructions=50755 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:03Z USER 49129 (sg02) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-08-07T13:54:03Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 381mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9685 memory location(s), 1 block(s), and 50755 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:03Z USER 49129 (sg02) [ModuleForkPass]: Running runtime_memory_reservation +2025-08-07T13:54:03Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=9685 blocks=1 instructions=50755 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:03Z USER 49129 (sg02) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-08-07T13:54:03Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 381mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9685 memory location(s), 1 block(s), and 50755 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:03Z USER 49129 (sg02) [ModuleForkPass]: Running coloring_allocator_psum +2025-08-07T13:54:03Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=9685 blocks=1 instructions=50755 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:03Z INFO 49129 (sg02) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:03Z INFO 49129 (sg02) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:03Z INFO 49129 (sg02) [PSUM_Allocator]: allocating PSUM +2025-08-07T13:54:03Z INFO 49129 (sg02) [PSUM_Allocator]: main loop +2025-08-07T13:54:03Z INFO 49129 (sg02) [PSUM_Allocator]: renumber locations +2025-08-07T13:54:03Z INFO 49129 (sg02) [PSUM_Allocator]: size = 6075 +2025-08-07T13:54:03Z INFO 49129 (sg02) [PSUM_Allocator]: build_no_bitmap start +2025-08-07T13:54:03Z INFO 49129 (sg02) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-08-07T13:54:03Z INFO 49129 (sg02) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-08-07T13:54:03Z INFO 49129 (sg02) [PSUM_Allocator]: found 16718 edges +2025-08-07T13:54:03Z INFO 49129 (sg02) [PSUM_Allocator]: mean: 5.50387 +2025-08-07T13:54:03Z INFO 49129 (sg02) [PSUM_Allocator]: median: 6.99538 +2025-08-07T13:54:03Z INFO 49129 (sg02) [PSUM_Allocator]: adjacency vectors require 133744 bytes +2025-08-07T13:54:03Z INFO 49129 (sg02) [PSUM_Allocator]: build_no_bitmap done +2025-08-07T13:54:03Z INFO 49129 (sg02) [PSUM_Allocator]: find costs +2025-08-07T13:54:03Z INFO 49129 (sg02) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-08-07T13:54:03Z INFO 49129 (sg02) [PSUM_Allocator]: simplify interference graph +2025-08-07T13:54:03Z INFO 49129 (sg02) [PSUM_Allocator]: initialize low and high +2025-08-07T13:54:03Z INFO 49129 (sg02) [PSUM_Allocator]: lo = 6075 +2025-08-07T13:54:03Z INFO 49129 (sg02) [PSUM_Allocator]: hi = 0 +2025-08-07T13:54:03Z INFO 49129 (sg02) [PSUM_Allocator]: inf = 0 +2025-08-07T13:54:03Z INFO 49129 (sg02) [PSUM_Allocator]: total = 6075 +2025-08-07T13:54:03Z INFO 49129 (sg02) [PSUM_Allocator]: simplify +2025-08-07T13:54:03Z INFO 49129 (sg02) [PSUM_Allocator]: new candidates = 0 +2025-08-07T13:54:03Z INFO 49129 (sg02) [PSUM_Allocator]: select ranges +2025-08-07T13:54:03Z INFO 49129 (sg02) [PSUM_Allocator]: no more spills +2025-08-07T13:54:03Z INFO 49129 (sg02) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-08-07T13:54:03Z INFO 49129 (sg02) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-08-07T13:54:03Z INFO 49129 (sg02) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-08-07T13:54:03Z USER 49129 (sg02) [ModuleForkPass]: coloring_allocator_psum finished after 0.268 seconds +2025-08-07T13:54:03Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 383mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9685 memory location(s), 1 block(s), and 50755 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:03Z USER 49129 (sg02) [ModuleForkPass]: Running dma_optimization_psum +2025-08-07T13:54:03Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=9685 blocks=1 instructions=50755 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:03Z INFO 49129 (sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-08-07T13:54:03Z INFO 49129 (sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-08-07T13:54:03Z USER 49129 (sg02) [ModuleForkPass]: dma_optimization_psum finished after 0.052 seconds +2025-08-07T13:54:03Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 383mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9685 memory location(s), 1 block(s), and 50755 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:03Z USER 49129 (sg02) [ModuleForkPass]: Running address_rotation_psum +2025-08-07T13:54:03Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=9685 blocks=1 instructions=50755 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:03Z INFO 49129 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2025-08-07T13:54:03Z INFO 49129 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 1 PSUM Banks +2025-08-07T13:54:03Z INFO 49129 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 1 PSUM Banks +2025-08-07T13:54:03Z USER 49129 (sg02) [ModuleForkPass]: address_rotation_psum finished after 0.338 seconds +2025-08-07T13:54:03Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 383mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9685 memory location(s), 1 block(s), and 50755 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:03Z USER 49129 (sg02) [ModuleForkPass]: Running coloring_allocator_sb +2025-08-07T13:54:03Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=9685 blocks=1 instructions=50755 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:03Z INFO 49129 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 775793182 +2025-08-07T13:54:03Z INFO 49129 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 7946 bytes +2025-08-07T13:54:03Z INFO 49129 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 2414602 +2025-08-07T13:54:03Z INFO 49129 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 1480 bytes +2025-08-07T13:54:03Z INFO 49129 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 8196 +2025-08-07T13:54:03Z INFO 49129 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 248 bytes +2025-08-07T13:54:03Z INFO 49129 (sg02) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:03Z INFO 49129 (sg02) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:03Z INFO 49129 (sg02) [SB_Allocator]: allocating SB +2025-08-07T13:54:03Z INFO 49129 (sg02) [SB_Allocator]: main loop +2025-08-07T13:54:03Z INFO 49129 (sg02) [SB_Allocator]: renumber locations +2025-08-07T13:54:03Z INFO 49129 (sg02) [SB_Allocator]: size = 3571 +2025-08-07T13:54:03Z INFO 49129 (sg02) [SB_Allocator]: find partners +2025-08-07T13:54:03Z INFO 49129 (sg02) [SB_Allocator]: found 6071 accumulation groups +2025-08-07T13:54:03Z INFO 49129 (sg02) [SB_Allocator]: largest = _dot.256-t851_i4 +2025-08-07T13:54:03Z INFO 49129 (sg02) [SB_Allocator]: tensors = 50 +2025-08-07T13:54:03Z INFO 49129 (sg02) [SB_Allocator]: requires 61440 bytes/partition +2025-08-07T13:54:03Z INFO 49129 (sg02) [SB_Allocator]: expanding partners +2025-08-07T13:54:03Z INFO 49129 []: find first defs for local +2025-08-07T13:54:04Z INFO 49129 []: find first defs for global +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: find loads +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: 1 pin count +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: 710 remat count +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: build interference graph +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: pass 1 int-tree +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: Num intervals 3571 Num locations 3571 +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: IntervalTree Build Done +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: info.neighbors init Done +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: info.neighbors partners Done +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: IntervalTree readback Done +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: edge: 29963 +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: mean: 16.7813 +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: median: 10.398 +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: find costs +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: simplify interference graph +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: initialize safe & unsafe +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: safe = 3267 +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: unsafe = 126 +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: inf = 177 +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: total = 3570 +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: simplify +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 122 #Pinned 0 #Safe 0 minCost 0.00361083 maxCost 1.29711 locations 3571 +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: new candidates = 98 +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: select ranges +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: Total: 3570 +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: Spilled: 0.000 (0) +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: Allocated: 1.000 (3570) +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: Rover zone: 0.928 (3313) +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: Pre-rover zone: 0.007 (25) +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: Post-rover zone: 0.064 (228) +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: Slice zone: 0.001 (4) +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: Blocks nothing: 0.057 (203) +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: Blocks medium: 0.003 (9) +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: Visited until medium blocking (mean): 0.654 +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: Visited until medium blocking (median): 0.711 +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: Visited until medium blocking (p95): 0.737 +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: Blocks tall: 0.941 (3358) +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: Visited until tall blocking (mean): 0.797 +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: Visited until tall blocking (median): 0.989 +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-08-07T13:54:04Z INFO 49129 (sg02) [SB_Allocator]: Success +2025-08-07T13:54:28Z INFO 49129 (sg02) [SB_Allocator]: SB spills = 0 tensors +2025-08-07T13:54:28Z INFO 49129 (sg02) [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:54:28Z INFO 49129 (sg02) [SB_Allocator]: remats = 0 tensors +2025-08-07T13:54:28Z INFO 49129 (sg02) [SB_Allocator]: unpinned = 0 tensors +2025-08-07T13:54:28Z INFO 49129 (sg02) [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:54:28Z INFO 49129 (sg02) [SB_Allocator]: SB score = 0 +2025-08-07T13:54:28Z INFO 49129 (sg02) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-08-07T13:54:28Z INFO 49129 (sg02) [SB_Allocator]: 16384 bytes/partition (100%) successfully pinned +2025-08-07T13:54:28Z INFO 49129 (sg02) [SB_Allocator]: pinning saved approximately 9010 cycles +2025-08-07T13:54:28Z INFO 49129 (sg02) [SB_Allocator]: 0% SB utilization after allocation +2025-08-07T13:54:28Z INFO 49129 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 775793182 +2025-08-07T13:54:28Z INFO 49129 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 7946 bytes +2025-08-07T13:54:28Z INFO 49129 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 2414602 +2025-08-07T13:54:28Z INFO 49129 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 1480 bytes +2025-08-07T13:54:28Z INFO 49129 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 8196 +2025-08-07T13:54:28Z INFO 49129 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 248 bytes +2025-08-07T13:54:28Z USER 49129 (sg02) [ModuleForkPass]: coloring_allocator_sb finished after 24.880 seconds +2025-08-07T13:54:28Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 383mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:28Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9685 memory location(s), 1 block(s), and 50755 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:28Z USER 49129 (sg02) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:28Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=9685 blocks=1 instructions=50755 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:28Z INFO 49129 (sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:28Z USER 49129 (sg02) [ModuleForkPass]: address_rotation_sb finished after 0.132 seconds +2025-08-07T13:54:28Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 383mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:28Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9685 memory location(s), 1 block(s), and 50755 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:28Z USER 49129 (sg02) [ModuleForkPass]: Running dma_optimization_sb +2025-08-07T13:54:28Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=9685 blocks=1 instructions=50755 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:28Z INFO 49129 (sg02) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 778207784, 99.3792% input load, 5.14002e-07% output write, 0.620817% spill/reload [sg0002] +2025-08-07T13:54:28Z INFO 49129 (sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:54:28Z INFO 49129 (sg02) [DMAOptimizationBase]: removed 0 identical load +2025-08-07T13:54:28Z INFO 49129 (sg02) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:54:28Z INFO 49129 (sg02) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:54:28Z INFO 49129 (sg02) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-08-07T13:54:28Z INFO 49129 (sg02) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(7.73377e+08) +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 6 spill/reload instructions +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 6 spill/reload memory locations +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload instructions +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload memory locations +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 4100, 0.0848642% out of total spill/reload dma traffic +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 2 spill/reload instructions +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 1 spill/reload memory locations +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: average loaded DMA size 7957 bytes +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: average saved DMA size 1608 bytes +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 775790876 +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 7957 bytes +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 2412296 +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 1608 bytes +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 512, 0.0105977% out of total spill/reload dma traffic +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 4612, 0.000592644% out of total dma traffic +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 778203172, 99.3798% input load, 5.14005e-07% output write, 0.620228% spill/reload [sg0002] +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 775790876 +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 7957 bytes +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 2412296 +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 1608 bytes +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 8196 +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 248 bytes +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 7858 bytes +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-08-07T13:54:29Z USER 49129 (sg02) [ModuleForkPass]: dma_optimization_sb finished after 0.330 seconds +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 386mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9677 memory location(s), 1 block(s), and 50748 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:29Z USER 49129 (sg02) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=9677 blocks=1 instructions=50748 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: SB Rotation rotated 302 Sb address +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: SB Rotation rotated 271 Sb address +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: SB Rotation rotated 165 Sb address +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: SB Rotation rotated 5 Sb address +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: SB Rotation rotated 225 Sb address +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:29Z USER 49129 (sg02) [ModuleForkPass]: address_rotation_sb finished after 0.295 seconds +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 386mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9677 memory location(s), 1 block(s), and 50748 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:29Z USER 49129 (sg02) [ModuleForkPass]: Running coloring_allocator_dram +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=9677 blocks=1 instructions=50748 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:29Z INFO 49129 (sg02) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:29Z INFO 49129 (sg02) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:29Z INFO 49129 (sg02) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-08-07T13:54:29Z INFO 49129 (sg02) [DRAM_Allocator]: reserved space = 775473690 bytes +2025-08-07T13:54:29Z INFO 49129 (sg02) [DRAM_Allocator]: spill space = 4513540 bytes +2025-08-07T13:54:29Z INFO 49129 (sg02) [DRAM_Allocator]: aligned spill space = 4554752 bytes +2025-08-07T13:54:29Z INFO 49129 (sg02) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-08-07T13:54:29Z INFO 49129 (sg02) [DRAM_Allocator]: renumber locations +2025-08-07T13:54:29Z INFO 49129 (sg02) [DRAM_Allocator]: size = 18 +2025-08-07T13:54:29Z INFO 49129 []: find first defs for local +2025-08-07T13:54:29Z INFO 49129 []: find first defs for global +2025-08-07T13:54:29Z INFO 49129 (sg02) [DRAM_Allocator]: Num intervals 18 Num locations 18 +2025-08-07T13:54:29Z INFO 49129 (sg02) [DRAM_Allocator]: IntervalTree Build Done +2025-08-07T13:54:29Z INFO 49129 (sg02) [DRAM_Allocator]: info.neighbors init Done +2025-08-07T13:54:29Z INFO 49129 (sg02) [DRAM_Allocator]: IntervalTree readback Done +2025-08-07T13:54:29Z INFO 49129 (sg02) [DRAM_Allocator]: simplify interference graph +2025-08-07T13:54:29Z INFO 49129 (sg02) [DRAM_Allocator]: initialize low and high +2025-08-07T13:54:29Z INFO 49129 (sg02) [DRAM_Allocator]: lo = 18 +2025-08-07T13:54:29Z INFO 49129 (sg02) [DRAM_Allocator]: hi = 0 +2025-08-07T13:54:29Z INFO 49129 (sg02) [DRAM_Allocator]: total = 18 +2025-08-07T13:54:29Z INFO 49129 (sg02) [DRAM_Allocator]: simplify +2025-08-07T13:54:29Z INFO 49129 (sg02) [DRAM_Allocator]: new candidates = 0 +2025-08-07T13:54:29Z INFO 49129 (sg02) [DRAM_Allocator]: select ranges +2025-08-07T13:54:29Z INFO 49129 (sg02) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-08-07T13:54:29Z INFO 49129 (sg02) [DRAM_Allocator]: allreduce_dram_hwm 2113536 +2025-08-07T13:54:29Z INFO 49129 (sg02) [DRAM_Allocator]: Real CC buffer size 2113536 +2025-08-07T13:54:29Z INFO 49129 (sg02) [DRAM_Allocator]: DRAM hwm after allocation: 3162112 +2025-08-07T13:54:29Z INFO 49129 (sg02) [DRAM_Allocator]: DRAM allocation successful +2025-08-07T13:54:29Z USER 49129 (sg02) [ModuleForkPass]: coloring_allocator_dram finished after 0.075 seconds +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 386mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9677 memory location(s), 1 block(s), and 50748 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:29Z USER 49129 (sg02) [ModuleForkPass]: Running address_rotation_dram +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=9677 blocks=1 instructions=50748 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: Runtime page size at 512MB +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: DRAM hwm before rotation 3162112 +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: allreduce hwm 2113536 +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: Real CC buffer size 2113536 +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: DRAM hwm after rotation 3162112 +2025-08-07T13:54:29Z INFO 49129 (sg02) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-08-07T13:54:29Z USER 49129 (sg02) [ModuleForkPass]: address_rotation_dram finished after 0.038 seconds +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 386mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9677 memory location(s), 1 block(s), and 50748 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:29Z USER 49129 (sg02) [ModuleForkPass]: Running tensorcopy_accel +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=9677 blocks=1 instructions=50748 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:29Z INFO 49129 (sg02) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-08-07T13:54:29Z INFO 49129 (sg02) [TensorCopyAccel::Impl]: Accelerated 0 out of 6038 tensorcopy in Function: sg0002 average acceleration factor: -nan +2025-08-07T13:54:29Z USER 49129 (sg02) [ModuleForkPass]: tensorcopy_accel finished after 0.005 seconds +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 386mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9677 memory location(s), 1 block(s), and 50748 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:29Z USER 49129 (sg02) [ModuleForkPass]: Running peephole_opts +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=9677 blocks=1 instructions=50748 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:29Z INFO 49129 (sg02) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-08-07T13:54:29Z USER 49129 (sg02) [ModuleForkPass]: peephole_opts finished after 0.017 seconds +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 386mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9677 memory location(s), 1 block(s), and 50751 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:29Z USER 49129 (sg02) [ModuleForkPass]: Running lower_kernel +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=9677 blocks=1 instructions=50751 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:29Z INFO 49129 (sg02) [LowerKernel]: Started running LowerKernel +2025-08-07T13:54:29Z INFO 49129 (sg02) [LowerKernel]: Start of kernel lowering pass, number of insts: 50751, number of allocs: 9677 +2025-08-07T13:54:29Z INFO 49129 (sg02) [LowerKernel]: Scan BKs time (s): 0.002976 +2025-08-07T13:54:29Z INFO 49129 (sg02) [LowerKernel]: Lower BKs time (s): 5e-06 +2025-08-07T13:54:29Z USER 49129 (sg02) [ModuleForkPass]: lower_kernel finished after 0.004 seconds +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 386mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9677 memory location(s), 1 block(s), and 50751 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:29Z USER 49129 (sg02) [ModuleForkPass]: Running lower_nki_kernel +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=9677 blocks=1 instructions=50751 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:29Z USER 49129 (sg02) [ModuleForkPass]: lower_nki_kernel finished after 0.003 seconds +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 386mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9677 memory location(s), 1 block(s), and 50751 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:29Z USER 49129 (sg02) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=9677 blocks=1 instructions=50751 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:29Z USER 49129 (sg02) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.005 seconds +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 386mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9677 memory location(s), 1 block(s), and 50751 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:29Z USER 49129 (sg02) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=9677 blocks=1 instructions=50751 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:29Z USER 49129 (sg02) [ModuleForkPass]: birverifier finished after 0.041 seconds +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 386mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9677 memory location(s), 1 block(s), and 50751 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:29Z USER 49129 (sg02) [ModuleForkPass]: Running dynamic_dma_scan +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=9677 blocks=1 instructions=50751 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:29Z USER 49129 (sg02) [ModuleForkPass]: dynamic_dma_scan finished after 0.006 seconds +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 386mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9677 memory location(s), 1 block(s), and 50751 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:29Z USER 49129 (sg02) [ModuleForkPass]: Running build_fdeps +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=9677 blocks=1 instructions=50751 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:29Z INFO 49129 (sg02) [build_flow_deps]: Start build fdeps. Invocation: 8Thu Aug 7 13:54:29 2025 +2025-08-07T13:54:29Z INFO 49129 (sg02) [build_flow_deps]: Allocs: 9677 instructions: 50751 +2025-08-07T13:54:29Z INFO 49129 (sg02) [build_flow_deps]: Build fdeps inserted 177889 edges +2025-08-07T13:54:29Z INFO 49129 (sg02) [build_flow_deps]: Done build fdeps 177889 Thu Aug 7 13:54:29 2025 +2025-08-07T13:54:29Z USER 49129 (sg02) [ModuleForkPass]: build_fdeps finished after 0.168 seconds +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 390mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9677 memory location(s), 1 block(s), and 50751 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:29Z USER 49129 (sg02) [ModuleForkPass]: Running remove_redundancies +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=9677 blocks=1 instructions=50751 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:29Z INFO 49129 (sg02) [RemoveRedundancies]: remove_clobbered_writes +2025-08-07T13:54:29Z INFO 49129 (sg02) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-08-07T13:54:29Z INFO 49129 (sg02) [RemoveRedundancies]: remove_useless_insts +2025-08-07T13:54:29Z INFO 49129 (sg02) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-08-07T13:54:29Z USER 49129 (sg02) [ModuleForkPass]: remove_redundancies finished after 0.018 seconds +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 390mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9677 memory location(s), 1 block(s), and 50751 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:29Z USER 49129 (sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:29Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=9677 blocks=1 instructions=50751 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:29Z INFO 49129 (sg02) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:29Z INFO 49129 (sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:54:29Z INFO 49129 (sg02) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:30Z USER 49129 (sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.224 seconds +2025-08-07T13:54:30Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 417mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:30Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9677 memory location(s), 1 block(s), and 50751 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:30Z USER 49129 (sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:54:30Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=9677 blocks=1 instructions=50751 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:30Z INFO 49129 (sg02) [TensorCopyElim]: Tensor CP elimination: 0 +2025-08-07T13:54:30Z INFO 49129 (sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:30Z USER 49129 (sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.047 seconds +2025-08-07T13:54:30Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 417mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:30Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9677 memory location(s), 1 block(s), and 50751 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:30Z USER 49129 (sg02) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-08-07T13:54:30Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=9677 blocks=1 instructions=50751 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:30Z USER 49129 (sg02) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-08-07T13:54:30Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 417mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:30Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9677 memory location(s), 1 block(s), and 50751 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:30Z USER 49129 (sg02) [ModuleForkPass]: Running post_sched +2025-08-07T13:54:30Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=9677 blocks=1 instructions=50751 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:30Z INFO 49129 [post_scheduler]: Start PosT ScheD 3 sunda Thu Aug 7 13:54:30 2025 +2025-08-07T13:54:30Z INFO 49129 [post_scheduler]: Time-aware hwm post-sched +2025-08-07T13:54:31Z INFO 49129 [post_scheduler]: Time-aware simulation time: 5657487 +2025-08-07T13:54:31Z INFO 49129 [post_scheduler]: Done PosT ScheD Thu Aug 7 13:54:31 2025 +2025-08-07T13:54:31Z USER 49129 (sg02) [ModuleForkPass]: post_sched finished after 1.336 seconds +2025-08-07T13:54:31Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 455mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:31Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9677 memory location(s), 1 block(s), and 50751 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:31Z USER 49129 (sg02) [ModuleForkPass]: Running expand_scheduling_units +2025-08-07T13:54:31Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=9677 blocks=1 instructions=50751 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:31Z USER 49129 (sg02) [ModuleForkPass]: expand_scheduling_units finished after 0.006 seconds +2025-08-07T13:54:31Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 455mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:31Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9677 memory location(s), 1 block(s), and 50751 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:31Z USER 49129 (sg02) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:31Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=9677 blocks=1 instructions=50751 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:31Z INFO 49129 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 3740 PSUM Banks +2025-08-07T13:54:31Z INFO 49129 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 3864 PSUM Banks +2025-08-07T13:54:32Z INFO 49129 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 1 PSUM Banks +2025-08-07T13:54:32Z INFO 49129 (sg02) [DMAOptimizationBase]: SB Rotation rotated 9 Sb address +2025-08-07T13:54:32Z INFO 49129 (sg02) [DMAOptimizationBase]: SB Rotation rotated 31 Sb address +2025-08-07T13:54:32Z INFO 49129 (sg02) [DMAOptimizationBase]: SB Rotation rotated 30 Sb address +2025-08-07T13:54:32Z INFO 49129 (sg02) [DMAOptimizationBase]: SB Rotation rotated 4 Sb address +2025-08-07T13:54:32Z INFO 49129 (sg02) [DMAOptimizationBase]: SB Rotation rotated 106 Sb address +2025-08-07T13:54:32Z INFO 49129 (sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:32Z INFO 49129 (sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:32Z USER 49129 (sg02) [ModuleForkPass]: address_rotation_sb finished after 0.955 seconds +2025-08-07T13:54:32Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 456mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:32Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9677 memory location(s), 1 block(s), and 50751 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:32Z USER 49129 (sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:32Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=9677 blocks=1 instructions=50751 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:32Z INFO 49129 (sg02) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:32Z INFO 49129 (sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:54:32Z INFO 49129 (sg02) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:32Z USER 49129 (sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.168 seconds +2025-08-07T13:54:32Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 460mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:32Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9677 memory location(s), 1 block(s), and 50751 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:32Z USER 49129 (sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:32Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=9677 blocks=1 instructions=50751 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:32Z INFO 49129 (sg02) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:32Z INFO 49129 (sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-08-07T13:54:32Z INFO 49129 (sg02) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:32Z USER 49129 (sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.036 seconds +2025-08-07T13:54:32Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 460mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:32Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9677 memory location(s), 1 block(s), and 50751 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:32Z USER 49129 (sg02) [ModuleForkPass]: Running dep_opt +2025-08-07T13:54:32Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=9677 blocks=1 instructions=50751 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:32Z INFO 49129 (sg02) [build_flow_deps]: Start build fdeps. Invocation: 9Thu Aug 7 13:54:32 2025 +2025-08-07T13:54:32Z INFO 49129 (sg02) [build_flow_deps]: Allocs: 9677 instructions: 50751 +2025-08-07T13:54:32Z INFO 49129 (sg02) [build_flow_deps]: Build fdeps inserted 174221 edges +2025-08-07T13:54:32Z INFO 49129 (sg02) [build_flow_deps]: Done build fdeps 174221 Thu Aug 7 13:54:32 2025 +2025-08-07T13:54:32Z USER 49129 (sg02) [ModuleForkPass]: dep_opt finished after 0.254 seconds +2025-08-07T13:54:32Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 460mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:32Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9677 memory location(s), 1 block(s), and 50751 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:32Z USER 49129 (sg02) [ModuleForkPass]: Running report_stats +2025-08-07T13:54:32Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=9677 blocks=1 instructions=50751 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:32Z INFO 49129 (sg02) [ReportStats]: Data Movement Statistics: sg0002 +┌─────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 3145728 │ +│ DMACopy │ Internal │ 1 │ 1048576 │ +│ Load │ Const -> Internal │ 4 │ 34824 │ +│ Load │ ExternalInput -> Internal │ 760 │ 773341708 │ +│ Load │ Internal │ 19 │ 2414344 │ +│ Save │ Internal │ 610 │ 2412292 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 4 │ +└─────────────┴────────────────────────────┴───────┴───────────┘ + +2025-08-07T13:54:32Z INFO 49129 (sg02) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 1 │ +│ 4 │ 9 │ +│ 8 │ 2 │ +│ 16 │ 3 │ +│ 64 │ 2 │ +│ 256 │ 2 │ +│ 512 │ 594 │ +│ 1024 │ 14 │ +│ 2048 │ 6 │ +│ 6144 │ 64 │ +│ 8192 │ 693 │ +│ 60768 │ 1 │ +│ 60776 │ 4 │ +│ 1048576 │ 3 │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:32Z INFO 49129 (sg02) [ReportStats]: MM Stats: #MatMults 42227 #MatMult-Transposes 19699 +2025-08-07T13:54:32Z INFO 49129 (sg02) [ReportStats]: IO Tensor size combined: 773341712 +2025-08-07T13:54:32Z INFO 49129 (sg02) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input473 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input469 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input472 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input470 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input474 │ ExternalInput │ bfloat16 │ 8192 │ +│ input471 │ ExternalInput │ bfloat16 │ 8192 │ +│ input1 │ ExternalInput │ int32 │ 512 │ +│ input3 │ ExternalInput │ float32 │ 12 │ +│ output0 │ ExternalOutput │ int32 │ 4 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-08-07T13:54:32Z INFO 49129 (sg02) [ReportStats]: Large (Internal) Tensor Statistics: +┌───────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├───────────────────────┼──────────┼──────────┼──────────────┤ +│ input469_local_766_i4 │ Internal │ bfloat16 │ 3145728 │ +│ input469_local_766_i3 │ Internal │ bfloat16 │ 3145728 │ +│ input469_local_766_i7 │ Internal │ bfloat16 │ 3145728 │ +│ input469_local_766_i5 │ Internal │ bfloat16 │ 3145728 │ +│ input469_local_766_i1 │ Internal │ bfloat16 │ 3145728 │ +│ input469_local_766_i6 │ Internal │ bfloat16 │ 3145728 │ +│ input469_local_766_i9 │ Internal │ bfloat16 │ 3145728 │ +│ input469_local_766_i8 │ Internal │ bfloat16 │ 3145728 │ +│ input469_local_766_i2 │ Internal │ bfloat16 │ 3145728 │ +│ input469_local_766_i0 │ Internal │ bfloat16 │ 3145728 │ +└───────────────────────┴──────────┴──────────┴──────────────┘ + +2025-08-07T13:54:32Z USER 49129 (sg02) [ModuleForkPass]: report_stats finished after 0.013 seconds +2025-08-07T13:54:32Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 460mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:32Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9677 memory location(s), 1 block(s), and 50751 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:32Z USER 49129 [ModuleForkPass]: Compilation status: Total modules: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:32Z USER 49129 [BackendPassManager]: mod_parallel_pass finished after 30.750 seconds +2025-08-07T13:54:32Z INFO 49129 [BackendPassManager]: curr_vmrss: 460mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:32Z INFO 49129 [BackendPassManager]: Output has 3 module(s), 3 function(s), 10897 memory location(s), 3 block(s), and 57355 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:32Z USER 49129 [BackendPassManager]: Running assign_trigger_engine +2025-08-07T13:54:32Z INFO 49129 [BackendPassManager]: Inputs to assign_trigger_engine: modules=3 functions=3 allocs=10897 blocks=3 instructions=57355 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:32Z INFO 49129 (sg00) [AssignTriggerEngine]: Assigned trigger engine for 85 DMA instructions. Moved 65 DMA instructions to CC's engines. +2025-08-07T13:54:32Z INFO 49129 (sg01) [AssignTriggerEngine]: Assigned trigger engine for 10 DMA instructions. Moved 2 DMA instructions to CC's engines. +2025-08-07T13:54:33Z INFO 49129 (sg02) [AssignTriggerEngine]: Assigned trigger engine for 614 DMA instructions. Moved 4 DMA instructions to CC's engines. +2025-08-07T13:54:33Z USER 49129 [BackendPassManager]: assign_trigger_engine finished after 0.036 seconds +2025-08-07T13:54:33Z INFO 49129 [BackendPassManager]: curr_vmrss: 460mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49129 [BackendPassManager]: Output has 3 module(s), 3 function(s), 10897 memory location(s), 3 block(s), and 57355 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:33Z USER 49129 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:54:33Z INFO 49129 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=3 functions=3 allocs=10897 blocks=3 instructions=57355 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:33Z USER 49129 (sg00) [SubgraphForkPass]: Running lower_local_collectives +2025-08-07T13:54:33Z INFO 49129 (sg00) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:33Z USER 49129 (sg00) [SubgraphForkPass]: lower_local_collectives finished after 0.000 seconds +2025-08-07T13:54:33Z INFO 49129 (sg00) [SubgraphForkPass]: curr_vmrss: 460mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49129 (sg00) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:33Z USER 49129 (sg00) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-08-07T13:54:33Z INFO 49129 (sg00) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:33Z USER 49129 (sg00) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.000 seconds +2025-08-07T13:54:33Z INFO 49129 (sg00) [SubgraphForkPass]: curr_vmrss: 460mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49129 (sg00) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:33Z USER 49129 (sg01) [SubgraphForkPass]: Running lower_local_collectives +2025-08-07T13:54:33Z USER 49129 (sg00) [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:54:33Z INFO 49129 (sg00) [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:33Z INFO 49129 (sg01) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=1 functions=1 allocs=702 blocks=1 instructions=5139 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:33Z USER 49129 (sg01) [SubgraphForkPass]: lower_local_collectives finished after 0.000 seconds +2025-08-07T13:54:33Z USER 49129 (sg02) [SubgraphForkPass]: Running lower_local_collectives +2025-08-07T13:54:33Z INFO 49129 (sg01) [SubgraphForkPass]: curr_vmrss: 460mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49129 (sg01) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 702 memory location(s), 1 block(s), and 5139 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:33Z USER 49129 (sg01) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-08-07T13:54:33Z INFO 49129 (sg01) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=1 functions=1 allocs=702 blocks=1 instructions=5139 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:33Z USER 49129 (sg01) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.000 seconds +2025-08-07T13:54:33Z INFO 49129 (sg01) [SubgraphForkPass]: curr_vmrss: 460mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49129 (sg01) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 702 memory location(s), 1 block(s), and 5139 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:33Z USER 49129 (sg01) [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:54:33Z INFO 49129 (sg01) [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=702 blocks=1 instructions=5139 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:33Z INFO 49129 (sg02) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=1 functions=1 allocs=9677 blocks=1 instructions=50751 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:33Z USER 49129 (sg02) [SubgraphForkPass]: lower_local_collectives finished after 0.000 seconds +2025-08-07T13:54:33Z INFO 49129 (sg02) [SubgraphForkPass]: curr_vmrss: 460mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49129 (sg02) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 9677 memory location(s), 1 block(s), and 50751 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:33Z USER 49129 (sg02) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-08-07T13:54:33Z INFO 49129 (sg02) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=1 functions=1 allocs=9677 blocks=1 instructions=50751 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:33Z USER 49129 (sg02) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.000 seconds +2025-08-07T13:54:33Z INFO 49129 (sg02) [SubgraphForkPass]: curr_vmrss: 460mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49129 (sg02) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 9677 memory location(s), 1 block(s), and 50751 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:33Z USER 49129 (sg02) [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:54:33Z INFO 49129 (sg02) [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=9677 blocks=1 instructions=50751 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:33Z INFO 49129 (sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:33Z INFO 49129 (sg01) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:33Z USER 49129 (sg00) [SubgraphForkPass]: dead_code_elim finished after 0.004 seconds +2025-08-07T13:54:33Z INFO 49129 (sg00) [SubgraphForkPass]: curr_vmrss: 460mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49129 (sg00) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:33Z USER 49129 (sg01) [SubgraphForkPass]: dead_code_elim finished after 0.005 seconds +2025-08-07T13:54:33Z INFO 49129 (sg01) [SubgraphForkPass]: curr_vmrss: 460mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49129 (sg01) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 702 memory location(s), 1 block(s), and 5139 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:33Z INFO 49129 (sg02) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:33Z USER 49129 (sg02) [SubgraphForkPass]: dead_code_elim finished after 0.046 seconds +2025-08-07T13:54:33Z INFO 49129 (sg02) [SubgraphForkPass]: curr_vmrss: 460mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49129 (sg02) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 9677 memory location(s), 1 block(s), and 50751 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:33Z USER 49129 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:33Z USER 49129 [BackendPassManager]: subgraph_parallel_pass finished after 0.049 seconds +2025-08-07T13:54:33Z INFO 49129 [BackendPassManager]: curr_vmrss: 460mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49129 [BackendPassManager]: Output has 3 module(s), 3 function(s), 10897 memory location(s), 3 block(s), and 57355 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:33Z USER 49129 [BackendPassManager]: Running assign_hwdge_engine +2025-08-07T13:54:33Z INFO 49129 [BackendPassManager]: Inputs to assign_hwdge_engine: modules=3 functions=3 allocs=10897 blocks=3 instructions=57355 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:33Z USER 49129 [BackendPassManager]: assign_hwdge_engine finished after 0.010 seconds +2025-08-07T13:54:33Z INFO 49129 [BackendPassManager]: curr_vmrss: 460mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49129 [BackendPassManager]: Output has 3 module(s), 3 function(s), 10897 memory location(s), 3 block(s), and 57355 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:33Z USER 49129 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:54:33Z INFO 49129 [BackendPassManager]: Inputs to mod_parallel_pass: modules=3 functions=3 allocs=10897 blocks=3 instructions=57355 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:33Z USER 49129 (sg00) [ModuleForkPass]: Running alloc_queues +2025-08-07T13:54:33Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:33Z INFO 49129 (sg00) [AllocQueues]: DMACopy transpose will be triggered from multiple engines +2025-08-07T13:54:33Z USER 49129 (sg01) [ModuleForkPass]: Running alloc_queues +2025-08-07T13:54:33Z USER 49129 (sg02) [ModuleForkPass]: Running alloc_queues +2025-08-07T13:54:33Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=702 blocks=1 instructions=5139 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:33Z INFO 49129 (sg01) [AllocQueues]: DMACopy transpose will be triggered from multiple engines +2025-08-07T13:54:33Z INFO 49129 (sg00) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 1 │ +│ qPoolIO0 │ input │ Pool │ 16 │ 1 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 3 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 20 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 64 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 61 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-08-07T13:54:33Z USER 49129 (sg00) [ModuleForkPass]: alloc_queues finished after 0.000 seconds +2025-08-07T13:54:33Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 460mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:33Z USER 49129 (sg00) [ModuleForkPass]: Running chain_dma_transposes +2025-08-07T13:54:33Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:33Z USER 49129 (sg00) [ModuleForkPass]: chain_dma_transposes finished after 0.000 seconds +2025-08-07T13:54:33Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=9677 blocks=1 instructions=50751 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:33Z INFO 49129 (sg02) [AllocQueues]: DMACopy transpose will be triggered from multiple engines +2025-08-07T13:54:33Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 460mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:33Z USER 49129 (sg00) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-08-07T13:54:33Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:33Z USER 49129 (sg00) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-08-07T13:54:33Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 460mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:33Z USER 49129 (sg00) [ModuleForkPass]: Running lower_control +2025-08-07T13:54:33Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:33Z INFO 49129 (sg01) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 1 │ +│ qPoolIO0 │ input │ Pool │ 16 │ 1 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 3 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 8 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 1 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 217 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-08-07T13:54:33Z USER 49129 (sg01) [ModuleForkPass]: alloc_queues finished after 0.001 seconds +2025-08-07T13:54:33Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 460mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 702 memory location(s), 1 block(s), and 5139 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:33Z USER 49129 (sg01) [ModuleForkPass]: Running chain_dma_transposes +2025-08-07T13:54:33Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=702 blocks=1 instructions=5139 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:33Z USER 49129 (sg01) [ModuleForkPass]: chain_dma_transposes finished after 0.000 seconds +2025-08-07T13:54:33Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 460mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 702 memory location(s), 1 block(s), and 5139 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:33Z USER 49129 (sg01) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-08-07T13:54:33Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=702 blocks=1 instructions=5139 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:33Z USER 49129 (sg01) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-08-07T13:54:33Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 460mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 702 memory location(s), 1 block(s), and 5139 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:33Z USER 49129 (sg01) [ModuleForkPass]: Running lower_control +2025-08-07T13:54:33Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=702 blocks=1 instructions=5139 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:33Z INFO 49129 (sg00) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-08-07T13:54:33Z USER 49129 (sg00) [ModuleForkPass]: lower_control finished after 0.002 seconds +2025-08-07T13:54:33Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 460mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:33Z USER 49129 (sg00) [ModuleForkPass]: Running dep_reduction +2025-08-07T13:54:33Z INFO 49129 (sg00) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=518 blocks=1 instructions=1465 Max writers: 32 Max Readers: 72 +2025-08-07T13:54:33Z INFO 49129 (sg00) [DepReduction]: Start Dependency Reduction +2025-08-07T13:54:33Z INFO 49129 (sg00) [DepReduction]: Processing async instrs... +2025-08-07T13:54:33Z INFO 49129 (sg00) [DepReduction]: Processing secondary edges per engine... +2025-08-07T13:54:33Z INFO 49129 (sg00) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 1338 +2025-08-07T13:54:33Z INFO 49129 (sg00) [DepReduction]: Processing redundant descendants, Done. Num edges removed 1462 +2025-08-07T13:54:33Z INFO 49129 (sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 1462 +2025-08-07T13:54:33Z INFO 49129 (sg01) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-08-07T13:54:33Z USER 49129 (sg01) [ModuleForkPass]: lower_control finished after 0.005 seconds +2025-08-07T13:54:33Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 460mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 702 memory location(s), 1 block(s), and 5139 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:33Z USER 49129 (sg01) [ModuleForkPass]: Running dep_reduction +2025-08-07T13:54:33Z INFO 49129 (sg01) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=702 blocks=1 instructions=5139 Max writers: 48 Max Readers: 384 +2025-08-07T13:54:33Z INFO 49129 (sg01) [DepReduction]: Start Dependency Reduction +2025-08-07T13:54:33Z INFO 49129 (sg02) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 5 │ +│ qPoolIO0 │ input │ Pool │ 16 │ 1 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 19 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 603 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 7 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 4 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 757 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-08-07T13:54:33Z USER 49129 (sg02) [ModuleForkPass]: alloc_queues finished after 0.009 seconds +2025-08-07T13:54:33Z INFO 49129 (sg01) [DepReduction]: Processing async instrs... +2025-08-07T13:54:33Z INFO 49129 (sg01) [DepReduction]: Processing secondary edges per engine... +2025-08-07T13:54:33Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 460mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9677 memory location(s), 1 block(s), and 50751 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:33Z USER 49129 (sg02) [ModuleForkPass]: Running chain_dma_transposes +2025-08-07T13:54:33Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=9677 blocks=1 instructions=50751 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:33Z USER 49129 (sg02) [ModuleForkPass]: chain_dma_transposes finished after 0.000 seconds +2025-08-07T13:54:33Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 460mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9677 memory location(s), 1 block(s), and 50751 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:33Z USER 49129 (sg02) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-08-07T13:54:33Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=9677 blocks=1 instructions=50751 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:33Z USER 49129 (sg02) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-08-07T13:54:33Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 460mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9677 memory location(s), 1 block(s), and 50751 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:33Z USER 49129 (sg02) [ModuleForkPass]: Running lower_control +2025-08-07T13:54:33Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=9677 blocks=1 instructions=50751 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:33Z INFO 49129 (sg01) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 5803 +2025-08-07T13:54:33Z INFO 49129 (sg00) [DepReduction]: Num Async removed: 0 +2025-08-07T13:54:33Z INFO 49129 (sg00) [DepReduction]: Finished dependency reduction: 9132 removed, new total 510 +2025-08-07T13:54:33Z INFO 49129 (sg00) [DepReduction]: Finished Dependency Reduction +2025-08-07T13:54:33Z USER 49129 (sg00) [ModuleForkPass]: dep_reduction finished after 0.010 seconds +2025-08-07T13:54:33Z INFO 49129 (sg00) [ModuleForkPass]: curr_vmrss: 460mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49129 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 518 memory location(s), 1 block(s), and 1465 instruction(s). Max writers: 32 Max Readers: 72 +2025-08-07T13:54:33Z INFO 49129 (sg01) [DepReduction]: Processing redundant descendants, Done. Num edges removed 6010 +2025-08-07T13:54:33Z INFO 49129 (sg01) [DepReduction]: Processing async instrs, Done. Num edges removed 6010 +2025-08-07T13:54:33Z INFO 49129 (sg01) [DepReduction]: Num Async removed: 0 +2025-08-07T13:54:33Z INFO 49129 (sg01) [DepReduction]: Finished dependency reduction: 31089 removed, new total 853 +2025-08-07T13:54:33Z INFO 49129 (sg01) [DepReduction]: Finished Dependency Reduction +2025-08-07T13:54:33Z USER 49129 (sg01) [ModuleForkPass]: dep_reduction finished after 0.032 seconds +2025-08-07T13:54:33Z INFO 49129 (sg01) [ModuleForkPass]: curr_vmrss: 460mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49129 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 702 memory location(s), 1 block(s), and 5139 instruction(s). Max writers: 48 Max Readers: 384 +2025-08-07T13:54:33Z INFO 49129 (sg02) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-08-07T13:54:33Z USER 49129 (sg02) [ModuleForkPass]: lower_control finished after 0.071 seconds +2025-08-07T13:54:33Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 460mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9677 memory location(s), 1 block(s), and 50751 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:33Z USER 49129 (sg02) [ModuleForkPass]: Running dep_reduction +2025-08-07T13:54:33Z INFO 49129 (sg02) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=9677 blocks=1 instructions=50751 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:33Z INFO 49129 (sg02) [DepReduction]: Start Dependency Reduction +2025-08-07T13:54:33Z INFO 49129 (sg02) [DepReduction]: Processing async instrs... +2025-08-07T13:54:33Z INFO 49129 (sg02) [DepReduction]: Processing secondary edges per engine... +2025-08-07T13:54:33Z INFO 49129 (sg02) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 44613 +2025-08-07T13:54:33Z INFO 49129 (sg02) [DepReduction]: Processing redundant descendants, Done. Num edges removed 46001 +2025-08-07T13:54:33Z INFO 49129 (sg02) [DepReduction]: Processing async instrs, Done. Num edges removed 46001 +2025-08-07T13:54:33Z INFO 49129 (sg02) [DepReduction]: Num Async removed: 0 +2025-08-07T13:54:33Z INFO 49129 (sg02) [DepReduction]: Finished dependency reduction: 308387 removed, new total 15007 +2025-08-07T13:54:33Z INFO 49129 (sg02) [DepReduction]: Finished Dependency Reduction +2025-08-07T13:54:33Z USER 49129 (sg02) [ModuleForkPass]: dep_reduction finished after 0.621 seconds +2025-08-07T13:54:33Z INFO 49129 (sg02) [ModuleForkPass]: curr_vmrss: 472mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49129 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9677 memory location(s), 1 block(s), and 50751 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:33Z USER 49129 [ModuleForkPass]: Compilation status: Total modules: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:33Z USER 49129 [BackendPassManager]: mod_parallel_pass finished after 0.742 seconds +2025-08-07T13:54:33Z INFO 49129 [BackendPassManager]: curr_vmrss: 452mb, ru_maxrss: 518mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49129 [BackendPassManager]: Output has 3 module(s), 3 function(s), 10897 memory location(s), 3 block(s), and 57355 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:33Z USER 49129 [BackendPassManager]: Running nc_parallel_pass +2025-08-07T13:54:33Z INFO 49129 [BackendPassManager]: Inputs to nc_parallel_pass: modules=3 functions=3 allocs=10897 blocks=3 instructions=57355 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:33Z USER 49129 [CoreForkPass]: Running bir_linker +2025-08-07T13:54:33Z INFO 49129 [CoreForkPass]: Inputs to bir_linker: modules=3 functions=3 allocs=10897 blocks=3 instructions=57355 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:33Z INFO 49129 (sgLnk) [BirLinker]: bir_linker cwd: +2025-08-07T13:54:33Z INFO 49129 (sgLnk) [BirLinker]: Num intermediates 111 +2025-08-07T13:54:33Z INFO 49129 (sgLnk) [BirLinker]: Num Module Definitions 3 +2025-08-07T13:54:33Z INFO 49129 (sgLnk) [BirLinker]: Linking to a call-graph structure +2025-08-07T13:54:33Z INFO 49129 (sgLnk) [BirLinker]: Added a new SpillReload Que qPoolPIOParam0 +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [BirLinker]: tensor_map verification successful. +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [BirLinker]: Writing updated tensor_map /home/ubuntu/qwen3/context_encoding_model/_tp0_bk0/neuronxcc-vebk23i5/sgLnk/sg00/tensor_map.json +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [BirLinker]: PostLink Stats: #MatMults 199502 #MatMult-Transposes 24041 +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [BirLinker]: Total Intermediate MMTs 1190 #out: 1120 #inp: 70 #symmetric: 0 +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [BirLinker]: Total Intermediate IOs with MMTs: 37 #out: 35 #inp: 2 #both: 0 +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [BirLinker]: releasing pre-link modules +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [BirLinker]: linking Done. +2025-08-07T13:54:34Z USER 49129 [CoreForkPass]: bir_linker finished after 0.788 seconds +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: curr_vmrss: 716mb, ru_maxrss: 716mb (delta=198mb) +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: Output has 1 module(s), 4 function(s), 11556 memory location(s), 4 block(s), and 57415 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:34Z USER 49129 [CoreForkPass]: Running postlnk_dma_report +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: Inputs to postlnk_dma_report: modules=1 functions=4 allocs=11556 blocks=4 instructions=57415 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [DMAReport]: DMA Report: Bytes loaded or saved 1055562024, 95.5358% input load, 0.206438% output write, 4.25772% spill/reload +2025-08-07T13:54:34Z USER 49129 [CoreForkPass]: postlnk_dma_report finished after 0.007 seconds +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: curr_vmrss: 406mb, ru_maxrss: 716mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: Output has 1 module(s), 4 function(s), 11556 memory location(s), 4 block(s), and 57415 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:34Z USER 49129 [CoreForkPass]: Running report_stats +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: Inputs to report_stats: modules=1 functions=4 allocs=11556 blocks=4 instructions=57415 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [ReportStats]: Data Movement Statistics: main +┌─────────────┬──────┬───────┬───────┐ +│ Instruction │ Kind │ Count │ Bytes │ +└─────────────┴──────┴───────┴───────┘ + +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [ReportStats]: Data Movement Statistics: sg0000 +┌──────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ ExternalInput -> Internal │ 1 │ 622329856 │ +│ DMACopy │ Internal -> ExternalOutput │ 8 │ 8388608 │ +│ DMACopy │ Internal -> Output │ 1 │ 2097152 │ +│ Load │ Const -> Internal │ 3 │ 37120 │ +│ Load │ ExternalInput -> Internal │ 45 │ 41952768 │ +│ Load │ Internal │ 32 │ 1048576 │ +│ Load (Spill) │ Internal │ 32 │ 33300480 │ +│ Save │ Internal │ 20 │ 1572864 │ +│ Save │ Internal -> Output │ 8 │ 1130498 │ +└──────────────┴────────────────────────────┴───────┴───────────┘ + +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 3 │ +│ 4 │ 1 │ +│ 32 │ 1 │ +│ 64 │ 1 │ +│ 128 │ 1 │ +│ 256 │ 52 │ +│ 512 │ 1 │ +│ 2048 │ 4 │ +│ 4096 │ 1 │ +│ 8192 │ 44 │ +│ 32520 │ 32 │ +│ 262144 │ 8 │ +│ 1048576 │ 2 │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [ReportStats]: Data Movement Statistics: sg0001 +┌─────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 3145728 │ +│ DMACopy │ Internal -> ExternalOutput │ 8 │ 8388608 │ +│ DMACopy │ Internal -> Output │ 1 │ 2097152 │ +│ Load │ Const -> Internal │ 2 │ 36864 │ +│ Load │ ExternalInput -> Internal │ 204 │ 192954880 │ +│ Load │ Input -> Internal │ 3 │ 81920 │ +│ Load │ Internal │ 2 │ 2097152 │ +│ Save │ Internal │ 8 │ 2097152 │ +│ Save │ Internal -> Output │ 2 │ 1048578 │ +└─────────────┴────────────────────────────┴───────┴───────────┘ + +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 3 │ +│ 32 │ 1 │ +│ 64 │ 2 │ +│ 128 │ 1 │ +│ 256 │ 3 │ +│ 2048 │ 8 │ +│ 6144 │ 64 │ +│ 8192 │ 139 │ +│ 262144 │ 8 │ +│ 1048576 │ 5 │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [ReportStats]: Data Movement Statistics: sg0002 +┌─────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 3145728 │ +│ DMACopy │ Internal │ 1 │ 1048576 │ +│ Load │ Const -> Internal │ 4 │ 34824 │ +│ Load │ ExternalInput -> Internal │ 760 │ 773341708 │ +│ Load │ Internal │ 19 │ 2414344 │ +│ Save │ Internal │ 610 │ 2412292 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 4 │ +└─────────────┴────────────────────────────┴───────┴───────────┘ + +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 1 │ +│ 4 │ 9 │ +│ 8 │ 2 │ +│ 16 │ 3 │ +│ 64 │ 2 │ +│ 256 │ 2 │ +│ 512 │ 594 │ +│ 1024 │ 14 │ +│ 2048 │ 6 │ +│ 6144 │ 64 │ +│ 8192 │ 693 │ +│ 60768 │ 1 │ +│ 60776 │ 4 │ +│ 1048576 │ 3 │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [ReportStats]: MM Stats: #MatMults 47624 #MatMult-Transposes 19893 +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [ReportStats]: IO Tensor size combined: 9981007404 +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬───────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼───────────────┼──────────┼──────────────┤ +│ input76_sg0000 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input473_sg0002 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input76 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input473 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input131 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input109 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input98 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input153 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input87 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input175 │ ExternalInput │ bfloat16 │ 50331648 │ +└────────────────────┴───────────────┴──────────┴──────────────┘ + +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [ReportStats]: Large (Internal) Tensor Statistics: +┌─────────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├─────────────────────────────┼──────────┼──────────┼──────────────┤ +│ input89_local_952_sg0001 │ Internal │ bfloat16 │ 4194304 │ +│ input78_local_1059_sg0000 │ Internal │ bfloat16 │ 4194304 │ +│ input84_local_886_i2_sg0001 │ Internal │ bfloat16 │ 3145728 │ +│ input84_local_886_i5_sg0001 │ Internal │ bfloat16 │ 3145728 │ +│ input84_local_886_i1_sg0001 │ Internal │ bfloat16 │ 3145728 │ +│ input84_local_886_i3_sg0001 │ Internal │ bfloat16 │ 3145728 │ +│ input84_local_886_i4_sg0001 │ Internal │ bfloat16 │ 3145728 │ +│ input84_local_886_i7_sg0001 │ Internal │ bfloat16 │ 3145728 │ +│ input84_local_886_i6_sg0001 │ Internal │ bfloat16 │ 3145728 │ +│ input84_local_886_i0_sg0001 │ Internal │ bfloat16 │ 3145728 │ +└─────────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-08-07T13:54:34Z USER 49129 [CoreForkPass]: report_stats finished after 0.014 seconds +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: curr_vmrss: 406mb, ru_maxrss: 716mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: Output has 1 module(s), 4 function(s), 11556 memory location(s), 4 block(s), and 57415 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:34Z USER 49129 [CoreForkPass]: Running coloring_allocator_dram_post_lnk +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: Inputs to coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=11556 blocks=4 instructions=57415 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [DRAM_Allocator]: allocating spills in DRAM post_link mode for address space Local +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [DRAM_Allocator]: reserved space = 8342039572 bytes +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [DRAM_Allocator]: spill space = 75579464 bytes +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [DRAM_Allocator]: aligned spill space = 75726848 bytes +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [DRAM_Allocator]: renumber locations +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [DRAM_Allocator]: size = 111 +2025-08-07T13:54:34Z INFO 49129 []: find first defs for local +2025-08-07T13:54:34Z INFO 49129 []: find first defs for global +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [DRAM_Allocator]: Num intervals 111 Num locations 111 +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [DRAM_Allocator]: IntervalTree Build Done +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [DRAM_Allocator]: info.neighbors init Done +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [DRAM_Allocator]: IntervalTree readback Done +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [DRAM_Allocator]: simplify interference graph +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [DRAM_Allocator]: initialize low and high +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [DRAM_Allocator]: lo = 111 +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [DRAM_Allocator]: hi = 0 +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [DRAM_Allocator]: total = 111 +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [DRAM_Allocator]: simplify +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [DRAM_Allocator]: new candidates = 0 +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [DRAM_Allocator]: Already used DRAM hwm: 5242880 +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [DRAM_Allocator]: select ranges +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [DRAM_Allocator]: allreduce_dram_hwm 5242880 +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [DRAM_Allocator]: Real CC buffer size 5242880 +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [DRAM_Allocator]: DRAM hwm after allocation: 10579968 +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [DRAM_Allocator]: DRAM allocation successful +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:34Z USER 49129 [CoreForkPass]: coloring_allocator_dram_post_lnk finished after 0.042 seconds +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: curr_vmrss: 406mb, ru_maxrss: 716mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: Output has 1 module(s), 4 function(s), 11556 memory location(s), 4 block(s), and 57415 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:34Z USER 49129 [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_post_lnk +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=11556 blocks=4 instructions=57415 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:34Z USER 49129 [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_post_lnk finished after 0.028 seconds +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: curr_vmrss: 406mb, ru_maxrss: 716mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: Output has 1 module(s), 4 function(s), 11556 memory location(s), 4 block(s), and 57415 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:34Z USER 49129 [CoreForkPass]: Running lower_dynamic_dma +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=4 allocs=11556 blocks=4 instructions=57415 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:34Z USER 49129 [CoreForkPass]: lower_dynamic_dma finished after 0.007 seconds +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: curr_vmrss: 406mb, ru_maxrss: 716mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: Output has 1 module(s), 4 function(s), 11556 memory location(s), 4 block(s), and 57415 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:34Z USER 49129 [CoreForkPass]: Running legalize_dynamic_dma +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=4 allocs=11556 blocks=4 instructions=57415 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 1 DGE instructions +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 1 DGE instructions were scanned +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-08-07T13:54:34Z USER 49129 [CoreForkPass]: legalize_dynamic_dma finished after 0.020 seconds +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: curr_vmrss: 406mb, ru_maxrss: 716mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: Output has 1 module(s), 4 function(s), 11556 memory location(s), 4 block(s), and 57415 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:34Z USER 49129 [CoreForkPass]: Running lower_dma +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: Inputs to lower_dma: modules=1 functions=4 allocs=11556 blocks=4 instructions=57415 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 7935/7935 (100% DGE) + power-of-2 partition : 7976/8017 (99.4886% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 7976/8017 (99.4886% DGE) + Cast (DGE/DMA) + 128 partition : 147/147 (100% DGE) + power-of-2 partition : 147/148 (99.3243% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 147/148 (99.3243% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 0/487 (0% DGE) + power-of-2 partition : 0/1140 (0% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/1140 (0% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/0 + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/0 + CopyMode + CCE : 36 + Transpose : 0 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 1/1 (100% DGE) + vector : 289/289 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-08-07T13:54:34Z USER 49129 [CoreForkPass]: lower_dma finished after 0.048 seconds +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: curr_vmrss: 406mb, ru_maxrss: 716mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: Output has 1 module(s), 4 function(s), 11556 memory location(s), 4 block(s), and 57415 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:34Z USER 49129 [CoreForkPass]: Running expand_all_engine +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=4 allocs=11556 blocks=4 instructions=57415 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:34Z USER 49129 [CoreForkPass]: expand_all_engine finished after 0.009 seconds +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: curr_vmrss: 406mb, ru_maxrss: 716mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: Output has 1 module(s), 4 function(s), 11556 memory location(s), 4 block(s), and 57415 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:34Z USER 49129 [CoreForkPass]: Running alloc_semaphores +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=4 allocs=11556 blocks=4 instructions=57415 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:34Z USER 49129 [CoreForkPass]: alloc_semaphores finished after 0.046 seconds +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: curr_vmrss: 406mb, ru_maxrss: 716mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: Output has 1 module(s), 4 function(s), 11556 memory location(s), 4 block(s), and 57415 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:34Z USER 49129 [CoreForkPass]: Running expand_inst_late +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=4 allocs=11556 blocks=4 instructions=57415 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:34Z USER 49129 [CoreForkPass]: expand_inst_late finished after 0.044 seconds +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: curr_vmrss: 406mb, ru_maxrss: 716mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: Output has 1 module(s), 4 function(s), 11556 memory location(s), 4 block(s), and 57439 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:34Z USER 49129 [CoreForkPass]: Running seq_inst_opt +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=4 allocs=11556 blocks=4 instructions=57439 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [SeqInstOpt]: Removing 10 unnecessary InstRegisterMove instruction(s) from Block1 +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [SeqInstOpt]: Removing 7 unnecessary InstRegisterMove instruction(s) from Block1 +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-08-07T13:54:34Z USER 49129 [CoreForkPass]: seq_inst_opt finished after 0.006 seconds +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: curr_vmrss: 406mb, ru_maxrss: 716mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: Output has 1 module(s), 4 function(s), 11556 memory location(s), 4 block(s), and 57422 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:34Z USER 49129 [CoreForkPass]: Running lower_sync +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: Inputs to lower_sync: modules=1 functions=4 allocs=11556 blocks=4 instructions=57422 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:34Z USER 49129 [CoreForkPass]: lower_sync finished after 0.018 seconds +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: curr_vmrss: 406mb, ru_maxrss: 716mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: Output has 1 module(s), 4 function(s), 11556 memory location(s), 4 block(s), and 59204 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:34Z USER 49129 [CoreForkPass]: Running lower_act +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: Inputs to lower_act: modules=1 functions=4 allocs=11556 blocks=4 instructions=59204 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:34Z USER 49129 [CoreForkPass]: lower_act finished after 0.007 seconds +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: curr_vmrss: 406mb, ru_maxrss: 716mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: Output has 1 module(s), 4 function(s), 11556 memory location(s), 4 block(s), and 59216 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:34Z USER 49129 [CoreForkPass]: Running lower_dve +2025-08-07T13:54:34Z INFO 49129 [CoreForkPass]: Inputs to lower_dve: modules=1 functions=4 allocs=11556 blocks=4 instructions=59216 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:34Z INFO 49129 (sgLnk) [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen2/dve_info.json +2025-08-07T13:54:35Z USER 49129 [CoreForkPass]: lower_dve finished after 0.071 seconds +2025-08-07T13:54:35Z INFO 49129 [CoreForkPass]: curr_vmrss: 411mb, ru_maxrss: 716mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49129 [CoreForkPass]: Output has 1 module(s), 4 function(s), 11556 memory location(s), 4 block(s), and 59216 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:35Z USER 49129 [CoreForkPass]: Running lower_ap +2025-08-07T13:54:35Z INFO 49129 [CoreForkPass]: Inputs to lower_ap: modules=1 functions=4 allocs=11556 blocks=4 instructions=59216 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:35Z USER 49129 [CoreForkPass]: lower_ap finished after 0.011 seconds +2025-08-07T13:54:35Z INFO 49129 [CoreForkPass]: curr_vmrss: 411mb, ru_maxrss: 716mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49129 [CoreForkPass]: Output has 1 module(s), 4 function(s), 11556 memory location(s), 4 block(s), and 59216 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:35Z USER 49129 [CoreForkPass]: Running coloring_allocator_reg +2025-08-07T13:54:35Z INFO 49129 [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=4 allocs=11556 blocks=4 instructions=59216 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: allocating REG +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: main loop iteration 1 +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: allocating REG +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: main loop iteration 1 +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: renumber registers +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: size = 3 +2025-08-07T13:54:35Z INFO 49129 []: find first defs for local reg +2025-08-07T13:54:35Z INFO 49129 []: find first defs for global reg +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: live range analysis +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: find costs +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: simplify interference graph +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: initialize low and high +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: lo = 3 +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: hi = 0 +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: inf = 0 +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: total = 3 +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: simplify +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: new candidates = 0 +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: select ranges +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: no more spills +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: allocating REG +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: main loop iteration 1 +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: renumber registers +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: size = 1 +2025-08-07T13:54:35Z INFO 49129 []: find first defs for local reg +2025-08-07T13:54:35Z INFO 49129 []: find first defs for global reg +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: live range analysis +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: find costs +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: simplify interference graph +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: initialize low and high +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: lo = 1 +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: hi = 0 +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: inf = 0 +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: total = 1 +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: simplify +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: new candidates = 0 +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: select ranges +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: no more spills +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: allocating REG +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: main loop iteration 1 +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: renumber registers +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: size = 4 +2025-08-07T13:54:35Z INFO 49129 []: find first defs for local reg +2025-08-07T13:54:35Z INFO 49129 []: find first defs for global reg +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: live range analysis +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: find costs +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: simplify interference graph +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: initialize low and high +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: lo = 4 +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: hi = 0 +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: inf = 0 +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: total = 4 +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: simplify +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: new candidates = 0 +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: select ranges +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: no more spills +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-08-07T13:54:35Z USER 49129 [CoreForkPass]: coloring_allocator_reg finished after 0.079 seconds +2025-08-07T13:54:35Z INFO 49129 [CoreForkPass]: curr_vmrss: 414mb, ru_maxrss: 716mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49129 [CoreForkPass]: Output has 1 module(s), 4 function(s), 11556 memory location(s), 4 block(s), and 59216 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:35Z USER 49129 [BackendPassManager]: nc_parallel_pass finished after 1.299 seconds +2025-08-07T13:54:35Z INFO 49129 [BackendPassManager]: curr_vmrss: 414mb, ru_maxrss: 716mb (delta=198mb) +2025-08-07T13:54:35Z INFO 49129 [BackendPassManager]: Output has 1 module(s), 4 function(s), 11556 memory location(s), 4 block(s), and 59216 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:35Z USER 49129 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:54:35Z INFO 49129 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=4 allocs=11556 blocks=4 instructions=59216 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:35Z USER 49129 [ModuleForkPass]: Running birverifier +2025-08-07T13:54:35Z INFO 49129 [ModuleForkPass]: Inputs to birverifier: modules=1 functions=4 allocs=11556 blocks=4 instructions=59216 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:35Z USER 49129 [ModuleForkPass]: birverifier finished after 0.063 seconds +2025-08-07T13:54:35Z INFO 49129 [ModuleForkPass]: curr_vmrss: 420mb, ru_maxrss: 716mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49129 [ModuleForkPass]: Output has 1 module(s), 4 function(s), 11556 memory location(s), 4 block(s), and 59216 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:35Z USER 49129 [BackendPassManager]: mod_parallel_pass finished after 0.065 seconds +2025-08-07T13:54:35Z INFO 49129 [BackendPassManager]: curr_vmrss: 420mb, ru_maxrss: 716mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49129 [BackendPassManager]: Output has 1 module(s), 4 function(s), 11556 memory location(s), 4 block(s), and 59216 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:35Z USER 49129 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:54:35Z INFO 49129 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=4 allocs=11556 blocks=4 instructions=59216 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:35Z USER 49129 [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:54:35Z INFO 49129 [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=4 allocs=11556 blocks=4 instructions=59216 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:35Z USER 49129 [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-08-07T13:54:35Z INFO 49129 [SubgraphForkPass]: curr_vmrss: 420mb, ru_maxrss: 716mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49129 [SubgraphForkPass]: Output has 1 module(s), 4 function(s), 11556 memory location(s), 4 block(s), and 59216 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:35Z USER 49129 [BackendPassManager]: subgraph_parallel_pass finished after 0.002 seconds +2025-08-07T13:54:35Z INFO 49129 [BackendPassManager]: curr_vmrss: 420mb, ru_maxrss: 716mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49129 [BackendPassManager]: Output has 1 module(s), 4 function(s), 11556 memory location(s), 4 block(s), and 59216 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:35Z USER 49129 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:54:35Z INFO 49129 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=4 allocs=11556 blocks=4 instructions=59216 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:35Z USER 49129 [ModuleForkPass]: Running codegen +2025-08-07T13:54:35Z INFO 49129 [ModuleForkPass]: Inputs to codegen: modules=1 functions=4 allocs=11556 blocks=4 instructions=59216 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [Codegen]: Total compiler allocated DRAM tensors: 0.00985336 GB +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [Codegen]: Total un-allocated DRAM tensors by kind: +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [Codegen]: +┌────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼─────────────┤ +│ ExternalInput │ 7.6285 │ +│ ExternalOutput │ 0.0703125 │ +│ Const │ 0.000101335 │ +└────────────────┴─────────────┘ + +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [Codegen]: Total runtime managed DRAM tensors: 7.69892 GB +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [Codegen]: Instruction Stats: +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [Codegen]: +┌─────────────────────┬───────┐ +│ Opcode │ Count │ +├─────────────────────┼───────┤ +│ MATMUL │ 47625 │ +│ LDWEIGHTS │ 47474 │ +│ ACTIVATE │ 6708 │ +│ EVENT_SEMAPHORE │ 1782 │ +│ UNKNOWN(0xd4) │ 1035 │ +│ PSEUDO_DMA_TRIGGER │ 742 │ +│ MATCH_VALUE_LOAD │ 441 │ +│ FIND_INDEX8 │ 224 │ +│ MAX8 │ 224 │ +│ MATCH_REPLACE8 │ 217 │ +│ UNKNOWN(0xd3) │ 185 │ +│ TENSOR_SCALAR_ADDR │ 151 │ +│ TENSOR_TENSOR │ 150 │ +│ POOL_BUFFER_LOAD │ 99 │ +│ GATHER │ 99 │ +│ UNKNOWN(0x8b) │ 97 │ +│ MEMSET │ 30 │ +│ UNKNOWN(0xda) │ 25 │ +│ TENSOR_SCALAR │ 23 │ +│ PSEUDO_BRANCH_LABEL │ 20 │ +│ UNKNOWN(0x8a) │ 16 │ +│ TENSOR_REDUCE │ 16 │ +│ UNKNOWN(0xd2) │ 15 │ +│ ACT_TABLE_LOAD │ 12 │ +│ COPY │ 11 │ +│ CAST │ 10 │ +│ UNKNOWN(0xcf) │ 10 │ +│ PSEUDO_DMA_REARM │ 10 │ +│ UNKNOWN(0x8d) │ 8 │ +│ UNKNOWN(0xd9) │ 7 │ +│ UNKNOWN(0xe8) │ 7 │ +│ RECIPROCAL │ 5 │ +│ MOVE │ 4 │ +│ UNKNOWN(0x92) │ 4 │ +│ STREAM_SHUFFLE │ 4 │ +│ LOAD_MASK_SELECT │ 4 │ +│ IOTA │ 3 │ +│ ALU_OP │ 2 │ +│ UNKNOWN(0xe5) │ 2 │ +│ TENSOR_SCALAR │ 1 │ +│ PSEUDO_TENSOR_LOAD │ 1 │ +│ RNG │ 1 │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [Codegen]: +┌────────────┬───────┐ +│ Engine │ Count │ +├────────────┼───────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 2332 │ +│ Scalar │ 8112 │ +│ Tensor │ 95421 │ +│ SyncDMA │ 0 │ +│ Vector │ 1565 │ +│ Sync │ 94 │ +│ All │ 0 │ +└────────────┴───────┘ + +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [Codegen]: Total instructions: 107524 (0.00640893 GB) +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [Codegen]: Total DynamicDMA instruction count: 1035 +2025-08-07T13:54:35Z USER 49129 (sgLnk) [Codegen]: isa_gen finished after 0.410 seconds +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────────────┼────────────────┤ +│ qActSpillReload0_defId_0 │ 5120 │ +│ qActSpillReload0_defId_1 │ 2048 │ +│ qActSpillReload0_defId_2 │ 2476 │ +│ qDVESpillReload0_defId_2 │ 8 │ +│ qPoolIO0 │ 2 │ +│ qPoolPIOParam0 │ 72 │ +│ qPoolSpillReload0_defId_0 │ 10240 │ +│ qPoolSpillReload0_defId_1 │ 256 │ +│ qPoolSpillReload0_defId_2 │ 1030 │ +│ qSPIO0 │ 18442 │ +│ qSPSpillReload0_defId_0 │ 514 │ +│ qSPSpillReload0_defId_1 │ 768 │ +│ qSPSpillReload0_defId_2 │ 1054 │ +└───────────────────────────┴────────────────┘ + +Total descriptors: 42030 (0.000626296 GB) +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qPoolDynamic │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qSPIO0 │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qPoolIO0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +│ qPoolPIOParam0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 128 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [Codegen]: Tensors with largest descriptor count: +┌────────────────────────────┬──────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├────────────────────────────┼──────────┼──────────┼──────────────────┤ +│ dot.7-buffer-1531_sg0001 │ Internal │ bfloat16 │ 4 │ +│ dot.14-buffer-2748_sg0002 │ Internal │ bfloat16 │ 4 │ +│ 950.1676_i3_sg0000 │ Internal │ bfloat16 │ 4 │ +│ dot.4-buffer-1868_sg0000 │ Internal │ bfloat16 │ 4 │ +│ dot.11-buffer-1536_sg0001 │ Internal │ bfloat16 │ 4 │ +│ transpose.1_sg0000 │ Internal │ bfloat16 │ 16 │ +│ all-reduce.531.1544_sg0001 │ Internal │ bfloat16 │ 35 │ +│ add.4_sg0001 │ Internal │ bfloat16 │ 36 │ +│ all_gather.1_sg0000 │ Internal │ bfloat16 │ 64 │ +│ convert.59_sg0002 │ Internal │ float32 │ 599 │ +└────────────────────────────┴──────────┴──────────┴──────────────────┘ + +2025-08-07T13:54:35Z USER 49129 (sgLnk) [Codegen]: dma_desc_gen finished after 0.024 seconds +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [Codegen]: Estimated peak DRAM usage: 7.71581 GB +2025-08-07T13:54:35Z INFO 49129 (sgLnk) [Codegen]: Generating debug info +2025-08-07T13:54:35Z WARNING 49129 (sgLnk) [Codegen]: Found 127 instructions with more than 100 dependencies. For each such instruction, skipping writing more than 100 dependencies into the built-in NEFF debug info to prevent excessive compile time and NEFF size. For those instructions, the Neuron profiler will not display the skipped dependencies. +2025-08-07T13:54:35Z USER 49129 (sgLnk) [Codegen]: debug_info_gen finished after 0.146 seconds +2025-08-07T13:54:35Z USER 49129 [ModuleForkPass]: codegen finished after 0.597 seconds +2025-08-07T13:54:35Z INFO 49129 [ModuleForkPass]: curr_vmrss: 468mb, ru_maxrss: 716mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49129 [ModuleForkPass]: Output has 1 module(s), 4 function(s), 11556 memory location(s), 4 block(s), and 59216 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:35Z USER 49129 [BackendPassManager]: mod_parallel_pass finished after 0.600 seconds +2025-08-07T13:54:35Z INFO 49129 [BackendPassManager]: curr_vmrss: 468mb, ru_maxrss: 716mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49129 [BackendPassManager]: Output has 1 module(s), 4 function(s), 11556 memory location(s), 4 block(s), and 59216 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:35Z USER 49129 [BackendPassManager]: Running neff_packager +2025-08-07T13:54:35Z INFO 49129 [BackendPassManager]: Inputs to neff_packager: modules=1 functions=4 allocs=11556 blocks=4 instructions=59216 Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:35Z INFO 49129 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.9-1124_CRSM.npy +2025-08-07T13:54:35Z INFO 49129 [NeffPackager]: FileDeDuper file not found value_sg0000_identity_1405_CRSM.npy +2025-08-07T13:54:35Z INFO 49129 [NeffPackager]: FileDeDuper file not found value_sg0000_t1879_CRSM.npy +2025-08-07T13:54:35Z INFO 49129 [NeffPackager]: FileDeDuper file not found value_sg0001_identity_1144_CRSM.npy +2025-08-07T13:54:35Z INFO 49129 [NeffPackager]: FileDeDuper file not found value_sg0001_t1547_CRSM.npy +2025-08-07T13:54:35Z INFO 49129 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.24_CRSM.npy +2025-08-07T13:54:35Z INFO 49129 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.25_CRSM.npy +2025-08-07T13:54:35Z INFO 49129 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.26-809-913_CRSM.npy +2025-08-07T13:54:35Z INFO 49129 [NeffPackager]: FileDeDuper file not found value_sg0002_identity_1055_CRSM.npy +2025-08-07T13:54:35Z INFO 49129 [NeffPackager]: Const File de-dup saved 0 KB of memory footprint +2025-08-07T13:54:35Z WARNING 49129 [NeffFileWriter]: writeKelp missing file /local/p4clients/pkgbuild-const/workspace/build/KaenaCompiler/KaenaCompiler-2.x.169490.0/AL2_x86_64/DEV.STD.PTHREAD/build/private/_skbuild/linux-x86_64-3.10/cmake-build/neuronxcc/walrus/neff_packager/MetricMetadata.json +2025-08-07T13:54:35Z INFO 49129 [NeffFileWriter]: Neff will be written to: /home/ubuntu/qwen3/context_encoding_model/_tp0_bk0/model.MODULE_f4171003694760566af4+a9cd68fb.neff +2025-08-07T13:54:35Z INFO 49129 [NeffFileWriter]: IR signature: a8a3756d7053d8c4542e3fd51c392fbe for neff artifacts +2025-08-07T13:54:35Z USER 49129 [BackendPassManager]: neff_packager finished after 0.088 seconds +2025-08-07T13:54:35Z INFO 49129 [BackendPassManager]: curr_vmrss: 469mb, ru_maxrss: 716mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49129 [BackendPassManager]: Output has 1 module(s), 4 function(s), 11556 memory location(s), 4 block(s), and 59216 instruction(s). Max writers: 594 Max Readers: 19699 +2025-08-07T13:54:35Z INFO 49129 [BackendDriver]: HBM scratchpad usage summary (post-allocation): +┌──────┬───────────┬────────────────────────────────────────────────────────────┬─────────────┐ +│ Core │ Subgraph │ Description │ Value │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc00 │ sg00 │ Peak scratchpad usage: local │ 0.003418 GB │ +│ nc00 │ sg00 │ Total size of allocated tensors: local │ 0.003418 GB │ +│ nc00 │ sg01 │ Peak scratchpad usage: local │ 0.004883 GB │ +│ nc00 │ sg01 │ Total size of allocated tensors: local │ 0.004883 GB │ +│ nc00 │ sg02 │ Peak scratchpad usage: local │ 0.002945 GB │ +│ nc00 │ sg02 │ Total size of allocated tensors: local │ 0.004242 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local │ 0.004883 GB │ +│ nc00 │ Post-link │ Peak scratchpad usage after intermediate tensor allocation │ 0.009853 GB │ +│ nc00 │ Post-link │ Total size of allocated intermediate tensors │ 0.070526 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ Max │ Max │ Peak scratchpad usage │ 0.009853 GB │ +│ Max │ Max │ Peak scratchpad usage (page-aligned) │ 0.500000 GB │ +└──────┴───────────┴────────────────────────────────────────────────────────────┴─────────────┘ + +2025-08-07T13:54:35Z INFO 49129 [BackendDriver]: Backend completed successfully, tearing down. +2025-08-07T13:54:36Z INFO 47514 [job.WalrusDriver.0]: new_lnkState: {"model": ["/home/ubuntu/qwen3/context_encoding_model/_tp0_bk0/model.MODULE_f4171003694760566af4+a9cd68fb.hlo_module.pb"], "tensormap": "tensor_map.json", "bir": "walrus_bir.out.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "cached_wavegraph": "walrus_bir.out.json", "state_dir": "/home/ubuntu/qwen3/context_encoding_model/_tp0_bk0/neuronxcc-vebk23i5/sgLnk/sg00", "state_id": "sgLnk"} +2025-08-07T13:54:36Z INFO 47514 [job.WalrusDriver.0]: MTBackend: completed successfully. +2025-08-07T13:54:36Z INFO 47514 [pipeline.Pipeline.0]: Finished job job.WalrusDriver.0 +2025-08-07T13:54:36Z INFO 47514 [pipeline.Pipeline.0]: Starting job job.BIRLinker.0 +2025-08-07T13:54:36Z INFO 47514 [job.BIRLinker.0]: Replay this job by calling: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/bin/neuronx-cc compile --framework XLA --state '{"model": ["/home/ubuntu/qwen3/context_encoding_model/_tp0_bk0/model.MODULE_f4171003694760566af4+a9cd68fb.hlo_module.pb"], "tensormap": "tensor_map.json", "bir": "walrus_bir.out.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "cached_wavegraph": "walrus_bir.out.json", "state_dir": "/home/ubuntu/qwen3/context_encoding_model/_tp0_bk0/neuronxcc-vebk23i5/sgLnk/sg00", "state_id": "sgLnk"}' --pipeline BIRLinker +2025-08-07T13:54:36Z INFO 47514 [job.BIRLinker.0]: BIRLinker cwd: /home/ubuntu/qwen3/context_encoding_model/_tp0_bk0/neuronxcc-vebk23i5 +2025-08-07T13:54:36Z INFO 47514 [job.BIRLinker.0]: Linking already done. +2025-08-07T13:54:36Z INFO 47514 [pipeline.Pipeline.0]: Finished job job.BIRLinker.0 +2025-08-07T13:54:36Z INFO 47514 [pipeline.Pipeline.0]: Starting job job.Kelper.0 +2025-08-07T13:54:36Z INFO 47514 [job.Kelper.0]: Skipping neff generation which was already performed by neff_packager +2025-08-07T13:54:36Z INFO 47514 [pipeline.Pipeline.0]: Finished job job.Kelper.0 +2025-08-07T13:54:36Z INFO 47514 [pipeline.Pipeline.0]: Starting job job.NeffWrapper.0 +2025-08-07T13:54:36Z INFO 47514 [job.NeffWrapper.0]: Job NeffWrapper len(in_states) 1 +2025-08-07T13:54:36Z INFO 47514 [job.NeffWrapper.0]: Processing input #0 +2025-08-07T13:54:36Z INFO 47514 [job.NeffWrapper.0]: Start NeffWrapper +2025-08-07T13:54:36Z INFO 47514 [job.NeffWrapper.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo-neff-wrapper --hlo /home/ubuntu/qwen3/context_encoding_model/_tp0_bk0/model.MODULE_f4171003694760566af4+a9cd68fb.hlo_module.pb --neff /home/ubuntu/qwen3/context_encoding_model/_tp0_bk0/model.MODULE_f4171003694760566af4+a9cd68fb.neff --io_transposes /home/ubuntu/qwen3/context_encoding_model/_tp0_bk0/neuronxcc-vebk23i5/io_transposes.json --output /home/ubuntu/qwen3/context_encoding_model/_tp0_bk0/wrapped_neff.hlo --netlist /home/ubuntu/qwen3/context_encoding_model/_tp0_bk0/neuronxcc-vebk23i5/hlo_netlist.json +2025-08-07T13:54:36Z INFO 47514 [job.NeffWrapper.0]: There are no io transposes nor zero-sized parameters. Output will not be produced. +Hlo neff wrapper finished successfully. Have a wonderful day :D + +2025-08-07T13:54:36Z INFO 47514 [job.NeffWrapper.0]: Job #0 finished +2025-08-07T13:54:36Z INFO 47514 [pipeline.Pipeline.0]: Finished job job.NeffWrapper.0 +2025-08-07T13:54:36Z INFO 47514 [pipeline.Pipeline.0]: Finished pipeline Pipeline +2025-08-07T13:54:36Z INFO 47514 [pipeline.Pipeline.0]: Job #0 finished +2025-08-07T13:54:36Z INFO 47449 [root]: Subcommand returned with exitcode=0 diff --git a/context_encoding_model/_tp0_bk0/metaneff.pb b/context_encoding_model/_tp0_bk0/metaneff.pb new file mode 100644 index 0000000000000000000000000000000000000000..4ec9acfda8ea84b1b61ec9ffe281d0dad634812d --- /dev/null +++ b/context_encoding_model/_tp0_bk0/metaneff.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0aef68f833b52be82fd0e17410bcfd279e5719338cb746c0619d5139fc4a3d02 +size 1042690 diff --git a/context_encoding_model/_tp0_bk0/model.MODULE_f4171003694760566af4+a9cd68fb.hlo_module.pb b/context_encoding_model/_tp0_bk0/model.MODULE_f4171003694760566af4+a9cd68fb.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..4b97c33685baf379228c0544e6fcb789bc132334 --- /dev/null +++ b/context_encoding_model/_tp0_bk0/model.MODULE_f4171003694760566af4+a9cd68fb.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d89b9e073981a0b1b7d0bbd0a24f147e9df13c5706d9d6be9971b857124c9496 +size 1119812 diff --git a/context_encoding_model/_tp0_bk0/model.MODULE_f4171003694760566af4+a9cd68fb.neff b/context_encoding_model/_tp0_bk0/model.MODULE_f4171003694760566af4+a9cd68fb.neff new file mode 100644 index 0000000000000000000000000000000000000000..1512ad540261d8bf04a2891e464dc116f454980e --- /dev/null +++ b/context_encoding_model/_tp0_bk0/model.MODULE_f4171003694760566af4+a9cd68fb.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0aeace703e08ac36bdcb2027d9a278403cb96ef39f48bddc999b077215e8a36 +size 1557504 diff --git a/context_encoding_model/_tp0_bk0/neuron_config.json b/context_encoding_model/_tp0_bk0/neuron_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9b3322dd87deb40b7cc80b8be7ce1fdd6a310b70 --- /dev/null +++ b/context_encoding_model/_tp0_bk0/neuron_config.json @@ -0,0 +1,220 @@ +{ + "_attn_implementation_autoset": false, + "_name_or_path": "Qwen/Qwen3-8B", + "add_cross_attention": false, + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "attribute_map": {}, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 151643, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 151645, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "fused_spec_config": null, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 12288, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 40960, + "max_window_layers": 36, + "metadata": null, + "min_length": 0, + "model_type": "qwen3", + "neuron_config": { + "activation_quantization_type": null, + "allow_input_truncation": false, + "apply_seq_ids_mask": false, + "async_mode": false, + "attention_dp_degree": 1, + "attention_dtype": null, + "attn_block_cte_nki_kernel_enabled": false, + "attn_block_tkg_nki_kernel_cache_update": false, + "attn_block_tkg_nki_kernel_enabled": false, + "attn_cls": { + "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3", + "__name__": "NeuronQwen3Attention" + }, + "attn_kernel_enabled": null, + "attn_tkg_builtin_kernel_enabled": false, + "attn_tkg_nki_kernel_enabled": false, + "batch_size": 1, + "bucket_n_active_tokens": true, + "buckets": [ + 128 + ], + "cast_type": "config", + "cc_pipeline_tiling_factor": 2, + "chunked_prefill_config": null, + "context_encoding_buckets": [ + 128 + ], + "cp_degree": 1, + "ctx_batch_size": 1, + "disable_kv_cache_tiling": false, + "draft_model_modules_to_not_convert": null, + "enable_bucketing": true, + "enable_eagle_draft_input_norm": false, + "enable_eagle_speculation": false, + "enable_fused_speculation": false, + "enable_long_context_mode": false, + "enable_output_completion_notifications": false, + "enable_spill_reload_dge": false, + "enable_token_tree": false, + "ep_degree": 1, + "expert_mlp_nki_kernel_enabled": null, + "flash_decoding_enabled": false, + "fused_qkv": false, + "fused_rmsnorm_skip_gamma": false, + "is_block_kv_layout": null, + "is_chunked_prefill": false, + "is_continuous_batching": true, + "is_eagle_draft": false, + "is_medusa": false, + "is_prefill_stage": true, + "is_prefix_caching": false, + "k_cache_transposed": false, + "kv_cache_batch_size": 1, + "kv_cache_padding_size": 0, + "kv_cache_quant": false, + "kv_cache_tiling": false, + "layer_boundary_markers": false, + "lm_head_pad": false, + "lm_head_pad_alignment_size": 1, + "local_ranks_size": 2, + "logical_nc_config": 1, + "lora_config": null, + "max_batch_size": 1, + "max_context_length": 1024, + "max_length": 1024, + "max_new_tokens": null, + "medusa_speculation_length": 0, + "medusa_tree": null, + "mlp_kernel_enabled": false, + "mlp_kernel_fuse_residual_add": false, + "modules_to_not_convert": null, + "moe_fused_nki_kernel_enabled": null, + "n_active_tokens": 1024, + "n_positions": 1024, + "num_medusa_heads": 0, + "on_cpu": false, + "on_device_sampling_config": { + "deterministic": false, + "do_sample": false, + "dynamic": true, + "global_topk": 256, + "on_device_sampling_config": true, + "temperature": 1.0, + "top_k": 1, + "top_k_kernel_enabled": false, + "top_p": 1.0 + }, + "output_logits": false, + "overrides_torch_dtype": true, + "pa_block_size": 1024, + "pa_num_blocks": 1, + "padding_side": "right", + "pp_degree": 1, + "prefix_buckets": null, + "qk_layernorm": false, + "qkv_kernel_enabled": false, + "qkv_kernel_fuse_residual_add": false, + "qkv_kernel_nbsd_layout": false, + "quantization_dtype": "int8", + "quantization_type": "per_tensor_symmetric", + "quantize_clamp_bound": Infinity, + "quantized": false, + "quantized_checkpoints_path": null, + "quantized_mlp_kernel_enabled": false, + "rmsnorm_quantize_kernel_enabled": false, + "router_topk_nki_kernel_enabled": null, + "rpl_reduce_dtype": null, + "save_sharded_checkpoint": true, + "scratchpad_page_size": null, + "seq_len": 1024, + "seq_len_threshold_for_cc_tiling": 16384, + "sequence_parallel_enabled": false, + "shared_mlp_nki_kernel_enabled": null, + "skip_sharding": false, + "skip_warmup": false, + "spec_batch_size": 1, + "speculation_length": 0, + "start_rank_id": 0, + "target": null, + "tile_cc": false, + "tkg_batch_size": 1, + "token_generation_buckets": null, + "token_tree_config": null, + "torch_dtype": "bfloat16", + "tp_degree": 2, + "vocab_parallel": false, + "weight_gather_seq_len_threshold": 32768, + "weights_to_skip_layout_optimization": [], + "world_size": 2 + }, + "no_repeat_ngram_size": 0, + "num_attention_heads": 32, + "num_beam_groups": 1, + "num_beams": 1, + "num_cores_per_group": 1, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sep_token_id": null, + "sliding_window": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torchscript": false, + "transformers_version": "4.51.0", + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/context_encoding_model/_tp0_bk1/command.txt b/context_encoding_model/_tp0_bk1/command.txt new file mode 100644 index 0000000000000000000000000000000000000000..514d492ce1800a9d4e460137a9d5a2d0da611d1b --- /dev/null +++ b/context_encoding_model/_tp0_bk1/command.txt @@ -0,0 +1 @@ +neuronx-cc compile --framework=XLA model.MODULE_2914133a46cb7b4660ab+d7af8a84.hlo_module.pb --output model.MODULE_2914133a46cb7b4660ab+d7af8a84.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=1 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35 \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk1/compile_flags.MODULE_2914133a46cb7b4660ab+d7af8a84.json b/context_encoding_model/_tp0_bk1/compile_flags.MODULE_2914133a46cb7b4660ab+d7af8a84.json new file mode 100644 index 0000000000000000000000000000000000000000..fff1c21954eec31f812b4884ff39e0411f3b87be --- /dev/null +++ b/context_encoding_model/_tp0_bk1/compile_flags.MODULE_2914133a46cb7b4660ab+d7af8a84.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=1", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/home/ubuntu/qwen3/context_encoding_model/_tp0_bk1/log-neuron-cc.txt"] \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk1/global_metric_store.json b/context_encoding_model/_tp0_bk1/global_metric_store.json new file mode 100644 index 0000000000000000000000000000000000000000..b089ffb9e2a389bccef7c21ae8c515ab5a51f2a7 --- /dev/null +++ b/context_encoding_model/_tp0_bk1/global_metric_store.json @@ -0,0 +1,1079 @@ +{ + "Average": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 99.65782165527344, + "StaticProfiler::AveragePartitionUtilization": 97.58238220214844, + "StaticProfiler::AveragePeUtilization": 98.61824035644531, + "StaticProfiler::LocalizationEfficiency": 98.78419494628906, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 100.47209167480469, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0 + } + }, + "Count": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 1.0, + "StaticProfiler::AveragePartitionUtilization": 1.0, + "StaticProfiler::AveragePeUtilization": 1.0, + "StaticProfiler::LocalizationEfficiency": 1.0, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 1.0 + } + }, + "Sum": { + "compiletime": { + "AGOrderingAnalysisPass": 0.019578933715820313, + "AffinePredicateResolution": 0.0019481182098388672, + "AliasDependencyElimination": 0.0001239776611328125, + "AliasDependencyInduction": 0.00577092170715332, + "AliasDependencyReset": 0.027690649032592773, + "BFComputeCutting": 0.0023322105407714844, + "BirCodeGenLoop": 0.4628438949584961, + "CCOpFusion": 0.022275209426879883, + "CanonicalizeConv": 3.300000025774352e-05, + "CanonicalizeDAGForPGTiling": 0.005593061447143555, + "CanonicalizeForTensorizer": 4.400000034365803e-05, + "CanonicalizeIR": 0.001634359359741211, + "Canonicalizer": 0.0008999999845400453, + "CoalesceCCOp": 0.015577077865600586, + "CommuteConcat": 0.0008616447448730469, + "DMALocalityOpt": 0.007327079772949219, + "DMAProfiler": 0.012569665908813477, + "DMATilingProfiler": 0.0037431716918945313, + "DataLocalityOpt": 0.06741714477539063, + "DataStreaming": 0.03615880012512207, + "DeConcat": 0.0005049705505371094, + "DeadCodeElimination": 0.0009002685546875, + "DeadStoreElimination": 0.0056514739990234375, + "DelinearIndices": 0.004773139953613281, + "Delinearization": 0.0026137828826904297, + "DoNothing": 0.0001933574676513672, + "DramToDramTranspose": 0.019293546676635742, + "DumpGraphAndMetadata": 0.10360383987426758, + "EliminateDivs": 0.003831148147583008, + "ExpandBatchNorm": 0.0019576549530029297, + "ExpandISAMacro": 0.012068033218383789, + "FactorizeBlkDims": 0.008942604064941406, + "FactorizeThreadAxesInFreeDims": 0.001847982406616211, + "FlattenMacroLoop": 0.003529787063598633, + "GenericAccessSimplifier": 0.0008223056793212891, + "HoistCompute": 7.999999979801942e-06, + "IdentifyCrossPassTensors": 4.8000001697801054e-05, + "InferInitValue": 0.025947093963623047, + "InferIntrinsicOnCC": 0.00908350944519043, + "InferNeuronTensor": 0.02371978759765625, + "InferNonlocalTensors": 0.014753341674804688, + "InferPSumTensor": 0.309035062789917, + "InlineNativeKernels": 0.008690595626831055, + "InsertIOTransposes": 0.01906275749206543, + "InsertLocalTransposes": 0.004312276840209961, + "InsertOffloadedTransposes": 0.002802133560180664, + "LICM": 0.003081083297729492, + "LateLegalizeInst": 0.014100313186645508, + "LateLegalizePostSplit": 0.012533903121948242, + "LateLowerReshapeOp": 0.001035451889038086, + "LateLowerTensorOp": 0.002605438232421875, + "LateNeuronInstComb": 0.009373188018798828, + "LayoutPreprocessing": 0.03434133529663086, + "LayoutPreprocessingAndAnalysis": 0.07319903373718262, + "LayoutRequirementAnalysis": 0.005194187164306641, + "LegalizeCCOpLayout": 0.0025322437286376953, + "LegalizeOpLevelAlias": 0.0020308494567871094, + "LegalizePartitionReduce": 0.0010001659393310547, + "LegalizeSundaAccess": 0.0786747932434082, + "LegalizeSundaMacro": 0.011176109313964844, + "LegalizeType": 0.014636754989624023, + "LocalLayoutOpt": 0.014019250869750977, + "LoopFusion": 0.005472898483276367, + "LoopSplitting": 0.00038623809814453125, + "LowerBroadcast": 0.0027265548706054688, + "LowerCCOpBlockAxis": 0.0058476924896240234, + "LowerComplexBroadcast": 0.00213623046875, + "LowerIntrinsics": 0.3070671558380127, + "LowerTensorOp": 0.010679960250854492, + "LowerTranspose": 0.012553691864013672, + "MacroGeneration": 0.029733657836914063, + "MaskPropagation": 0.0028328895568847656, + "MemcastMotion": 1.8999999156221747e-05, + "MemcpyElimination": 0.026583433151245117, + "MutateDataType": 0.0020093917846679688, + "NeuronAliasDependencyInduction": 0.00018548965454101563, + "NeuronAliasDependencyReset": 0.02524423599243164, + "NeuronInstComb": 0.004286766052246094, + "NeuronLICM": 0.03554058074951172, + "NeuronLoopFusion": 0.007987260818481445, + "NeuronLoopInterchange": 0.0023233890533447266, + "NeuronSimplifier": 0.0075054168701171875, + "NeuronSimplifyPredicates": 0.12207841873168945, + "NeuronValueNumbering": 0.0038213729858398438, + "OptimizeAliasedCopyChain": 0.0005936622619628906, + "OptimizeNKIKernels": 0.44962644577026367, + "PAGLayoutOpt": 0.0999138355255127, + "PComputeCutting": 0.005170106887817383, + "PGLayoutTilingPipeline": 0.7408750057220459, + "PGTiling": 0.29245758056640625, + "PadElimination": 0.000308990478515625, + "ParAxesAnnotation": 0.05283546447753906, + "PartialLoopFusion": 0.0043125152587890625, + "PartialSimdFusion": 0.004901885986328125, + "PenguinizeFunctions": 4.3000000005122274e-05, + "PerfectLoopNest": 0.001722574234008789, + "PruneFunctions": 4.199999966658652e-05, + "RecognizeOpIdiom": 0.004076480865478516, + "Recompute": 0.0002620220184326172, + "RelaxPredicates": 0.013286828994750977, + "Rematerialization": 0.0021238327026367188, + "RemoveOptimizationBarriers": 4.3000000005122274e-05, + "ReshapeWeights": 0.0006799697875976563, + "ResolveAccessConflict": 0.0040090084075927734, + "ResolveComplicatePredicates": 0.001981496810913086, + "RewriteReplicationMatmul": 0.0021796226501464844, + "RewriteWeights": 0.0022602081298828125, + "SFKVectorizer": 0.274188756942749, + "ScatterMotion": 5.7999997807201e-05, + "SimpleAllReduceTiling": 0.009164094924926758, + "Simplifier": 0.0046122074127197266, + "SimplifyMacroPredicates": 0.010458230972290039, + "SimplifyNeuronTensor": 1.0516629219055176, + "SimplifySlice": 0.0009145736694335938, + "SimplifyTensor": 0.00577855110168457, + "SpillPSum": 0.012692689895629883, + "SplitAPUnionSets": 0.10518908500671387, + "SplitAccGrp": 0.001172780990600586, + "StaticProfiler": 0.0124053955078125, + "StaticTransposeLocalTensor": 0.0038576126098632813, + "SundaISel": 0.04396390914916992, + "TCTransform": 0.0018804073333740234, + "TensorInitialization": 0.012793779373168945, + "TensorOpSimplifier": 0.0045316219329833984, + "TensorOpTransform": 0.021115541458129883, + "TensorizerLegalizationPass": 6.999999459367245e-05, + "TileCCOps": 0.0056231021881103516, + "TilingProfiler": 0.00790858268737793, + "TransformConvOp": 0.0030431747436523438, + "TritiumFusion": 0.03186154365539551, + "ValueNumbering": 0.0038623809814453125, + "VectorizeDMA": 0.0021522045135498047, + "VectorizeMatMult": 0.003453969955444336, + "VerifySupportedOps": 3.300000025774352e-05, + "WeightCoalescing": 0.009244203567504883, + "ZeroSizeTensorElimination": 0.00011420249938964844, + "algsimp": 0.0026100000832229853, + "batchnorm_expander": 3.9999998989515007e-05, + "boundary-marker-removal": 1.2000000424450263e-05, + "call-inliner": 0.00046499999007210135, + "canonicalize-boundary-marker": 1.8000000636675395e-05, + "collective-stream-id-checker": 9.200000204145908e-05, + "comparison-expander": 0.0005959999980404973, + "computation-deduplicator": 6.900000153109431e-05, + "conditional-to-select": 1.700000029813964e-05, + "config-lowering": 7.79999973019585e-05, + "constant-statistics": 0.0005530000198632479, + "constant_folding": 0.0003320000250823796, + "cse": 3.7000001611886546e-05, + "dce": 7.800000457791612e-05, + "dot_decomposer": 0.0014440000522881746, + "dynamic-slice-transpose": 1.2000000424450263e-05, + "eliminate-redundant-compare": 0.00028100001509301364, + "emit-offloaded-dropout": 4.099999932805076e-05, + "flatten-call-graph": 0.0009379999246448278, + "fuse-send-recv": 7.200000254670158e-05, + "hilo::LegalizeAlias": 1.2999999853491317e-05, + "hilo::NeuronInstCombine": 0.00010099999781232327, + "hilo::NeuronOpFusion": 2.5000001187436283e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 4.3000000005122274e-05, + "hilo::ScheduleFusion": 1.9999999949504854e-06, + "hilo::SixtyFourHack": 8.900000102585182e-05, + "hilo::VerifyAliasing": 4.999999873689376e-06, + "hlo-mac-count": 0.0012799999676644802, + "hlo-verifier": 0.007751000113785267, + "instruction-histogram": 0.0006590000120922923, + "io-con-pipe-begin": 6.000000212225132e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.0014029999729245901, + "io-statistics": 6.199999916134402e-05, + "legalize-ccops": 3.999999989900971e-06, + "legalize-compare": 1.1000000085914508e-05, + "lower-argminmax-custom-call": 1.2000000424450263e-05, + "map-inline": 0.0008909999742172658, + "metadata-naming": 5.7999997807201e-05, + "mlir::detail::OpToOpPassAdaptor": 0.00016799999866634607, + "mlir::hlo::MhloToPyPenguin": 0.0028260000981390476, + "mlir::mhlo::LowerComplexExtraPass": 0.00026000000070780516, + "mlir::mhlo::LowerComplexPass": 0.0002699999895412475, + "native-to-custom-softmax": 0.0007219999679364264, + "native-to-custom-softmax-dx": 0.0005740000051446259, + "operand_upcaster": 6.399999983841553e-05, + "opt-barrier-removal": 0.0005649999948218465, + "post-par-pipe-begin": 9.600000339560211e-05, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0016929999692365527, + "pre-par-pipe-begin": 9.999999974752427e-07, + "pre-par-pipe-end": 0.0, + "pre-partition-simplification": 0.1934960037469864, + "replace-minimum-constant": 0.00044299999717622995, + "reshape-mover": 0.00010800000018207356, + "simplify-concat": 0.00014099999680183828, + "simplify-while-loops": 9.600000339560211e-05, + "transform-variadic-reduce": 7.900000491645187e-05, + "tuple-simplifier": 0.0002980000281240791, + "unpack-nested-aws-ntwsr": 0.0004720000142697245, + "unroll-while-loop": 1.8999999156221747e-05, + "zero_sized_hlo_elimination": 0.0008989999769255519 + }, + "hilo": { + "ConstantSize": 599333.0, + "HloInputCount": 475.0, + "HloMacCount": 50240159744.0, + "HloOutputCount": 73.0, + "IfmapSize": 8266543104.0, + "OfmapSize": 75497472.0, + "OutputsReadFromCount": 0.0, + "PassthroughTensorsCount": 0.0, + "RedundantOutputCount": 0.0, + "Traffic": 1663506816.0 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 43318.0, + "StaticProfiler::AifUb": 154.8094024658203, + "StaticProfiler::ArithmeticIntensityTensorizer": 152.92723083496094, + "StaticProfiler::AverageDmaLength": 4809.89794921875, + "StaticProfiler::DDRTransferBytes": 787141440.0, + "StaticProfiler::InternalTransferBytes": 634853888.0, + "StaticProfiler::LoadExpanded": 98070.0, + "StaticProfiler::StoreExpanded": 2397.0, + "StaticProfiler::TotalDMAExpanded": 100467.0, + "StaticProfiler::TotalDynamicInstancesCount": 50670.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 50224.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 4.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 22848.0, + "TilingProfiler::NumPfTransposes": 5.0, + "TilingProfiler::NumPfTransposesForIo": 1.0, + "TilingProfiler::NumPfTransposesForLocal": 1.0, + "TilingProfiler::NumPfTransposesForNonlocal": 3.0, + "TilingProfiler::PfTransposeInstructions": 19201.0, + "TilingProfiler::PfTransposeInstructionsForIo": 19008.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 1.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 192.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 4.0, + "TilingProfiler::SimdInstructionsAfterTiling": 158.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "all": { + "compiletime": { + "algsimp": 0.0024220000486820936, + "call-inliner": 0.0004349999944679439, + "collective-stream-id-checker": 8.199999865610152e-05, + "comparison-expander": 0.0005810000002384186, + "constant-statistics": 0.0005530000198632479, + "constant_folding": 0.0003060000017285347, + "dce": 7.500000356230885e-05, + "dot_decomposer": 0.0014440000522881746, + "eliminate-redundant-compare": 0.0002690000110305846, + "flatten-call-graph": 0.0009069999796338379, + "hlo-mac-count": 0.0010560000082477927, + "hlo-verifier": 0.007164000067859888, + "instruction-histogram": 0.0006590000120922923, + "io-con-pipe-begin": 6.000000212225132e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.0014029999729245901, + "io-statistics": 6.199999916134402e-05, + "map-inline": 0.0008549999911338091, + "native-to-custom-softmax": 0.0007029999978840351, + "native-to-custom-softmax-dx": 0.000522000016644597, + "opt-barrier-removal": 0.0005649999948218465, + "pre-par-pipe-begin": 9.999999974752427e-07, + "pre-par-pipe-end": 0.0, + "pre-partition-simplification": 0.1934960037469864, + "replace-minimum-constant": 0.00042100000428035855, + "reshape-mover": 9.7999996796716e-05, + "simplify-while-loops": 9.000000136438757e-05, + "tuple-simplifier": 0.00028300000121816993, + "unpack-nested-aws-ntwsr": 0.0004600000102072954, + "unroll-while-loop": 1.8999999156221747e-05, + "zero_sized_hlo_elimination": 0.0008989999769255519 + } + }, + "cumsum": { + "compiletime": { + "CoalesceCCOp": 0.0002090930938720703, + "DMALocalityOpt": 0.00018835067749023438, + "DMAProfiler": 0.0008924007415771484, + "DataStreaming": 0.0002593994140625, + "DoNothing": 0.00011873245239257813, + "ExpandISAMacro": 0.0005505084991455078, + "FactorizeBlkDims": 0.0004696846008300781, + "InferPSumTensor": 0.0004990100860595703, + "LateLegalizeInst": 0.0004222393035888672, + "LateNeuronInstComb": 0.0005340576171875, + "LegalizeSundaAccess": 0.0017271041870117188, + "LegalizeType": 0.0002815723419189453, + "LowerBroadcast": 0.0002243518829345703, + "LowerIntrinsics": 0.0002181529998779297, + "LowerTranspose": 0.00024199485778808594, + "NeuronInstComb": 0.0004971027374267578, + "NeuronLICM": 0.0004258155822753906, + "NeuronSimplifyPredicates": 0.002941608428955078, + "NeuronValueNumbering": 0.0004222393035888672, + "SFKVectorizer": 0.002941131591796875, + "SimpleAllReduceTiling": 0.00019812583923339844, + "SimplifyNeuronTensor": 0.00045800209045410156, + "SpillPSum": 0.0005657672882080078, + "WeightCoalescing": 0.00020837783813476563 + } + }, + "sg00": { + "compiletime": { + "CanonicalizeConv": 3.099999958067201e-05, + "CanonicalizeForTensorizer": 1.5999999959603883e-05, + "Canonicalizer": 0.00032900000223889947, + "HoistCompute": 3.000000106112566e-06, + "IdentifyCrossPassTensors": 2.8000000384054147e-05, + "MemcastMotion": 1.1000000085914508e-05, + "PenguinizeFunctions": 1.5999999959603883e-05, + "PruneFunctions": 1.2999999853491317e-05, + "RemoveOptimizationBarriers": 2.300000051036477e-05, + "ScatterMotion": 1.9999999494757503e-05, + "TensorizerLegalizationPass": 4.3000000005122274e-05, + "VerifySupportedOps": 1.2000000424450263e-05, + "algsimp": 6.600000051548705e-05, + "batchnorm_expander": 1.4000000192027073e-05, + "boundary-marker-removal": 3.999999989900971e-06, + "call-inliner": 9.000000318337698e-06, + "canonicalize-boundary-marker": 6.000000212225132e-06, + "collective-stream-id-checker": 3.000000106112566e-06, + "comparison-expander": 4.999999873689376e-06, + "computation-deduplicator": 1.9999999494757503e-05, + "conditional-to-select": 4.999999873689376e-06, + "config-lowering": 3.099999958067201e-05, + "constant_folding": 9.000000318337698e-06, + "cse": 1.2999999853491317e-05, + "dce": 9.999999974752427e-07, + "dynamic-slice-transpose": 3.999999989900971e-06, + "eliminate-redundant-compare": 3.999999989900971e-06, + "emit-offloaded-dropout": 1.2999999853491317e-05, + "flatten-call-graph": 9.999999747378752e-06, + "fuse-send-recv": 2.499999936844688e-05, + "hilo::LegalizeAlias": 6.000000212225132e-06, + "hilo::NeuronInstCombine": 4.3000000005122274e-05, + "hilo::NeuronOpFusion": 9.000000318337698e-06, + "hilo::ReplaceTokenTypeWithU8Pass": 1.5999999959603883e-05, + "hilo::ScheduleFusion": 9.999999974752427e-07, + "hilo::SixtyFourHack": 1.4999999621068127e-05, + "hilo::VerifyAliasing": 1.9999999949504854e-06, + "hlo-mac-count": 2.5999999706982635e-05, + "hlo-verifier": 0.0001939999929163605, + "legalize-ccops": 1.9999999949504854e-06, + "legalize-compare": 3.999999989900971e-06, + "lower-argminmax-custom-call": 3.999999989900971e-06, + "map-inline": 1.2000000424450263e-05, + "metadata-naming": 1.8000000636675395e-05, + "mlir::detail::OpToOpPassAdaptor": 1.9999999494757503e-05, + "mlir::hlo::MhloToPyPenguin": 0.0009980000322684646, + "mlir::mhlo::LowerComplexExtraPass": 8.800000068731606e-05, + "mlir::mhlo::LowerComplexPass": 0.00015999999595806003, + "native-to-custom-softmax": 7.000000096013537e-06, + "native-to-custom-softmax-dx": 1.2999999853491317e-05, + "operand_upcaster": 1.9999999494757503e-05, + "post-par-pipe-begin": 8.900000102585182e-05, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.000582000007852912, + "replace-minimum-constant": 7.000000096013537e-06, + "reshape-mover": 3.999999989900971e-06, + "simplify-concat": 4.8000001697801054e-05, + "simplify-while-loops": 1.9999999949504854e-06, + "transform-variadic-reduce": 9.000000318337698e-06, + "tuple-simplifier": 4.999999873689376e-06, + "unpack-nested-aws-ntwsr": 3.999999989900971e-06, + "unroll-while-loop": 0.0 + }, + "hilo": { + "ArithmeticIntensity": 16.6773738861084, + "ConstantSize": 599333.0, + "HloInputCount": 475.0, + "HloMacCount": 5637144576.0, + "HloOutputCount": 73.0, + "IfmapSize": 8266543104.0, + "OfmapSize": 75497472.0, + "OutputsReadFromCount": 0.0, + "PassthroughTensorsCount": 0.0, + "RedundantOutputCount": 0.0, + "Traffic": 676023104.0 + } + }, + "sg0000": { + "compiletime": { + "AGOrderingAnalysisPass": 0.08161520957946777, + "AffinePredicateResolution": 0.001527547836303711, + "AliasDependencyElimination": 0.00012493133544921875, + "AliasDependencyInduction": 0.008615970611572266, + "AliasDependencyReset": 0.03425288200378418, + "BFComputeCutting": 0.003037691116333008, + "BirCodeGenLoop": 0.05175900459289551, + "CCOpFusion": 0.024791479110717773, + "CanonicalizeDAGForPGTiling": 0.003105640411376953, + "CanonicalizeIR": 0.0020570755004882813, + "CoalesceCCOp": 0.005420684814453125, + "CommuteConcat": 0.0015554428100585938, + "DMALocalityOpt": 0.0025992393493652344, + "DMAProfiler": 0.004426240921020508, + "DMATilingProfiler": 0.00414586067199707, + "DataLocalityOpt": 0.11810016632080078, + "DataStreaming": 0.0053942203521728516, + "DeConcat": 0.0011267662048339844, + "DeadCodeElimination": 0.0016050338745117188, + "DeadStoreElimination": 0.030996084213256836, + "DelinearIndices": 0.007958412170410156, + "Delinearization": 0.003355741500854492, + "DoNothing": 7.987022399902344e-05, + "DramToDramTranspose": 0.03346753120422363, + "DumpGraphAndMetadata": 0.005443096160888672, + "EliminateDivs": 0.004342555999755859, + "ExpandBatchNorm": 0.0018055438995361328, + "ExpandISAMacro": 0.003648519515991211, + "FactorizeBlkDims": 0.019720077514648438, + "FactorizeThreadAxesInFreeDims": 0.0019965171813964844, + "FlattenMacroLoop": 0.003274679183959961, + "GenericAccessSimplifier": 0.0009877681732177734, + "InferInitValue": 0.032111167907714844, + "InferIntrinsicOnCC": 0.014227867126464844, + "InferNeuronTensor": 0.04684329032897949, + "InferNonlocalTensors": 0.10579586029052734, + "InferPSumTensor": 0.04808926582336426, + "InlineNativeKernels": 0.0025835037231445313, + "InsertIOTransposes": 0.012038707733154297, + "InsertLocalTransposes": 0.007574796676635742, + "InsertOffloadedTransposes": 0.003882884979248047, + "LICM": 0.003116607666015625, + "LateLegalizeInst": 0.006630420684814453, + "LateLegalizePostSplit": 0.0030584335327148438, + "LateLowerReshapeOp": 0.002176046371459961, + "LateLowerTensorOp": 0.005063295364379883, + "LateNeuronInstComb": 0.024392366409301758, + "LayoutPreprocessing": 0.03173065185546875, + "LayoutPreprocessingAndAnalysis": 0.07484269142150879, + "LayoutRequirementAnalysis": 0.007186174392700195, + "LegalizeCCOpLayout": 0.003088235855102539, + "LegalizeOpLevelAlias": 0.0011813640594482422, + "LegalizePartitionReduce": 0.0013763904571533203, + "LegalizeSundaAccess": 0.04270172119140625, + "LegalizeSundaMacro": 0.009444236755371094, + "LegalizeType": 0.004534721374511719, + "LocalLayoutOpt": 0.01777815818786621, + "LoopFusion": 0.0060007572174072266, + "LoopSplitting": 0.000377655029296875, + "LowerBroadcast": 0.0016138553619384766, + "LowerCCOpBlockAxis": 0.004978179931640625, + "LowerComplexBroadcast": 0.0023903846740722656, + "LowerIntrinsics": 0.034012556076049805, + "LowerTensorOp": 0.01333928108215332, + "LowerTranspose": 0.011911869049072266, + "MacroGeneration": 0.07152104377746582, + "MaskPropagation": 0.004988193511962891, + "MemcpyElimination": 0.11162376403808594, + "MutateDataType": 0.0014476776123046875, + "NeuronAliasDependencyInduction": 0.0002269744873046875, + "NeuronAliasDependencyReset": 0.15035724639892578, + "NeuronInstComb": 0.015686750411987305, + "NeuronLICM": 0.011453866958618164, + "NeuronLoopFusion": 0.018696069717407227, + "NeuronLoopInterchange": 0.0018415451049804688, + "NeuronSimplifier": 0.011624336242675781, + "NeuronSimplifyPredicates": 0.005795955657958984, + "NeuronValueNumbering": 0.0040967464447021484, + "OptimizeAliasedCopyChain": 0.0014064311981201172, + "OptimizeNKIKernels": 0.0021300315856933594, + "PAGLayoutOpt": 0.33215951919555664, + "PComputeCutting": 0.008408308029174805, + "PGLayoutTilingPipeline": 1.3294909000396729, + "PGTiling": 0.3412203788757324, + "PadElimination": 0.0018661022186279297, + "ParAxesAnnotation": 0.29718852043151855, + "PartialLoopFusion": 0.024113893508911133, + "PartialSimdFusion": 0.029590368270874023, + "PerfectLoopNest": 0.0021219253540039063, + "RecognizeOpIdiom": 0.004444122314453125, + "Recompute": 0.00028204917907714844, + "RelaxPredicates": 0.004793405532836914, + "Rematerialization": 0.004267692565917969, + "ReshapeWeights": 0.0014717578887939453, + "ResolveAccessConflict": 0.0038602352142333984, + "ResolveComplicatePredicates": 0.001505136489868164, + "RewriteReplicationMatmul": 0.0020885467529296875, + "RewriteWeights": 0.003512144088745117, + "SFKVectorizer": 0.3296499252319336, + "SimpleAllReduceTiling": 0.002294301986694336, + "Simplifier": 0.004443168640136719, + "SimplifyMacroPredicates": 0.013223648071289063, + "SimplifyNeuronTensor": 0.011357307434082031, + "SimplifySlice": 0.0010068416595458984, + "SimplifyTensor": 0.006380319595336914, + "SpillPSum": 0.018645763397216797, + "SplitAPUnionSets": 0.031983375549316406, + "SplitAccGrp": 0.0017464160919189453, + "StaticProfiler": 0.004789590835571289, + "StaticTransposeLocalTensor": 0.0048563480377197266, + "SundaISel": 0.046004533767700195, + "TCTransform": 0.0017864704132080078, + "TensorInitialization": 0.015267372131347656, + "TensorOpSimplifier": 0.006502866744995117, + "TensorOpTransform": 0.029101848602294922, + "TileCCOps": 0.0055658817291259766, + "TilingProfiler": 0.014283895492553711, + "TransformConvOp": 0.0028002262115478516, + "TritiumFusion": 0.037850379943847656, + "ValueNumbering": 0.002534627914428711, + "VectorizeDMA": 0.0056002140045166016, + "VectorizeMatMult": 0.004069805145263672, + "WeightCoalescing": 0.0033059120178222656, + "ZeroSizeTensorElimination": 0.00012040138244628906 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 1945.0, + "StaticProfiler::AifUb": 18.54642677307129, + "StaticProfiler::ArithmeticIntensityTensorizer": 234.4757080078125, + "StaticProfiler::AverageDmaLength": 3607.790283203125, + "StaticProfiler::AverageFractalPeUtilization": 99.84349822998047, + "StaticProfiler::AveragePartitionUtilization": 96.70350646972656, + "StaticProfiler::AveragePeUtilization": 99.51932525634766, + "StaticProfiler::DDRTransferBytes": 53226752.0, + "StaticProfiler::InternalTransferBytes": 27462656.0, + "StaticProfiler::LoadExpanded": 10244.0, + "StaticProfiler::LocalizationEfficiency": 1264.2635498046875, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1466.4949951171875, + "StaticProfiler::StoreExpanded": 3713.0, + "StaticProfiler::TotalDMAExpanded": 13957.0, + "StaticProfiler::TotalDynamicInstancesCount": 2107.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 2103.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 24.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 1010.0, + "TilingProfiler::NumPfTransposes": 6.0, + "TilingProfiler::NumPfTransposesForIo": 0.0, + "TilingProfiler::NumPfTransposesForLocal": 5.0, + "TilingProfiler::NumPfTransposesForNonlocal": 1.0, + "TilingProfiler::PfTransposeInstructions": 176.0, + "TilingProfiler::PfTransposeInstructionsForIo": 0.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 144.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 32.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 0.0, + "TilingProfiler::SimdInstructionsAfterTiling": 177.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg0001": { + "compiletime": { + "AGOrderingAnalysisPass": 0.035902976989746094, + "AffinePredicateResolution": 0.0021402835845947266, + "AliasDependencyElimination": 0.0001494884490966797, + "AliasDependencyInduction": 0.00843667984008789, + "AliasDependencyReset": 0.07715225219726563, + "BFComputeCutting": 0.002821207046508789, + "BirCodeGenLoop": 0.03489971160888672, + "CCOpFusion": 0.03669166564941406, + "CanonicalizeDAGForPGTiling": 0.0034394264221191406, + "CanonicalizeIR": 0.001888275146484375, + "CoalesceCCOp": 0.0048944950103759766, + "CommuteConcat": 0.001985788345336914, + "DMALocalityOpt": 0.0010595321655273438, + "DMAProfiler": 0.0038537979125976563, + "DMATilingProfiler": 0.0052776336669921875, + "DataLocalityOpt": 0.13663840293884277, + "DataStreaming": 0.004033327102661133, + "DeConcat": 0.0017592906951904297, + "DeadCodeElimination": 0.0027074813842773438, + "DeadStoreElimination": 0.03486442565917969, + "DelinearIndices": 0.010581493377685547, + "Delinearization": 0.004877567291259766, + "DoNothing": 6.914138793945313e-05, + "DramToDramTranspose": 0.03982400894165039, + "DumpGraphAndMetadata": 0.004088640213012695, + "EliminateDivs": 0.0045583248138427734, + "ExpandBatchNorm": 0.0018122196197509766, + "ExpandISAMacro": 0.0023725032806396484, + "FactorizeBlkDims": 0.013248920440673828, + "FactorizeThreadAxesInFreeDims": 0.0023849010467529297, + "FlattenMacroLoop": 0.0036728382110595703, + "GenericAccessSimplifier": 0.0026085376739501953, + "InferInitValue": 0.038416147232055664, + "InferIntrinsicOnCC": 0.010096549987792969, + "InferNeuronTensor": 0.05150651931762695, + "InferNonlocalTensors": 0.031507015228271484, + "InferPSumTensor": 0.03166079521179199, + "InlineNativeKernels": 0.0021262168884277344, + "InsertIOTransposes": 0.022419452667236328, + "InsertLocalTransposes": 0.0071408748626708984, + "InsertOffloadedTransposes": 0.0034465789794921875, + "LICM": 0.004317283630371094, + "LateLegalizeInst": 0.004563570022583008, + "LateLegalizePostSplit": 0.0027570724487304688, + "LateLowerReshapeOp": 0.0013232231140136719, + "LateLowerTensorOp": 0.004618406295776367, + "LateNeuronInstComb": 0.020873546600341797, + "LayoutPreprocessing": 0.037287235260009766, + "LayoutPreprocessingAndAnalysis": 0.10860347747802734, + "LayoutRequirementAnalysis": 0.007799863815307617, + "LegalizeCCOpLayout": 0.001935720443725586, + "LegalizeOpLevelAlias": 0.0012698173522949219, + "LegalizePartitionReduce": 0.002346515655517578, + "LegalizeSundaAccess": 0.016484975814819336, + "LegalizeSundaMacro": 0.011503934860229492, + "LegalizeType": 0.0047261714935302734, + "LocalLayoutOpt": 0.02424001693725586, + "LoopFusion": 0.007829427719116211, + "LoopSplitting": 0.00044846534729003906, + "LowerBroadcast": 0.0014789104461669922, + "LowerCCOpBlockAxis": 0.0059947967529296875, + "LowerComplexBroadcast": 0.0023598670959472656, + "LowerIntrinsics": 0.035590410232543945, + "LowerTensorOp": 0.012118339538574219, + "LowerTranspose": 0.011335611343383789, + "MacroGeneration": 0.11938071250915527, + "MaskPropagation": 0.003367900848388672, + "MemcpyElimination": 0.10591435432434082, + "MutateDataType": 0.002183198928833008, + "NeuronAliasDependencyInduction": 0.0002372264862060547, + "NeuronAliasDependencyReset": 0.02314162254333496, + "NeuronInstComb": 0.01471090316772461, + "NeuronLICM": 0.007970094680786133, + "NeuronLoopFusion": 0.022555112838745117, + "NeuronLoopInterchange": 0.0015497207641601563, + "NeuronSimplifier": 0.012836694717407227, + "NeuronSimplifyPredicates": 0.001605987548828125, + "NeuronValueNumbering": 0.0046231746673583984, + "OptimizeAliasedCopyChain": 0.00162506103515625, + "OptimizeNKIKernels": 0.0015685558319091797, + "PAGLayoutOpt": 0.14427471160888672, + "PComputeCutting": 0.00727081298828125, + "PGLayoutTilingPipeline": 1.2423913478851318, + "PGTiling": 0.5181164741516113, + "PadElimination": 0.00038051605224609375, + "ParAxesAnnotation": 0.09470343589782715, + "PartialLoopFusion": 0.018784761428833008, + "PartialSimdFusion": 0.027338027954101563, + "PerfectLoopNest": 0.0021829605102539063, + "RecognizeOpIdiom": 0.0048656463623046875, + "Recompute": 0.0002601146697998047, + "RelaxPredicates": 0.0033593177795410156, + "Rematerialization": 0.0023822784423828125, + "ReshapeWeights": 0.0014538764953613281, + "ResolveAccessConflict": 0.0047032833099365234, + "ResolveComplicatePredicates": 0.0019354820251464844, + "RewriteReplicationMatmul": 0.002605438232421875, + "RewriteWeights": 0.004354715347290039, + "SFKVectorizer": 0.16805624961853027, + "SimpleAllReduceTiling": 0.0025529861450195313, + "Simplifier": 0.00439763069152832, + "SimplifyMacroPredicates": 0.007683992385864258, + "SimplifyNeuronTensor": 0.0066149234771728516, + "SimplifySlice": 0.0023670196533203125, + "SimplifyTensor": 0.0063228607177734375, + "SpillPSum": 0.01709151268005371, + "SplitAPUnionSets": 0.018975019454956055, + "SplitAccGrp": 0.002074003219604492, + "StaticProfiler": 0.0037796497344970703, + "StaticTransposeLocalTensor": 0.005953311920166016, + "SundaISel": 0.0426335334777832, + "TCTransform": 0.0011513233184814453, + "TensorInitialization": 0.002532958984375, + "TensorOpSimplifier": 0.006600379943847656, + "TensorOpTransform": 0.034122467041015625, + "TileCCOps": 0.0059397220611572266, + "TilingProfiler": 0.013670921325683594, + "TransformConvOp": 0.002622365951538086, + "TritiumFusion": 0.05379676818847656, + "ValueNumbering": 0.0030698776245117188, + "VectorizeDMA": 0.0016117095947265625, + "VectorizeMatMult": 0.005866289138793945, + "WeightCoalescing": 0.0026290416717529297, + "ZeroSizeTensorElimination": 0.00011897087097167969 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 6049.0, + "StaticProfiler::AifUb": 251.7889862060547, + "StaticProfiler::ArithmeticIntensityTensorizer": 253.54466247558594, + "StaticProfiler::AverageDmaLength": 6385.9599609375, + "StaticProfiler::AverageFractalPeUtilization": 100.0, + "StaticProfiler::AveragePartitionUtilization": 99.86996459960938, + "StaticProfiler::AveragePeUtilization": 100.0, + "StaticProfiler::DDRTransferBytes": 204350464.0, + "StaticProfiler::InternalTransferBytes": 21430272.0, + "StaticProfiler::LoadExpanded": 27520.0, + "StaticProfiler::LocalizationEfficiency": 100.69728088378906, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 105.00786590576172, + "StaticProfiler::StoreExpanded": 2305.0, + "StaticProfiler::TotalDMAExpanded": 29825.0, + "StaticProfiler::TotalDynamicInstancesCount": 6153.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 6153.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 16.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 4848.0, + "TilingProfiler::NumPfTransposes": 8.0, + "TilingProfiler::NumPfTransposesForIo": 3.0, + "TilingProfiler::NumPfTransposesForLocal": 3.0, + "TilingProfiler::NumPfTransposesForNonlocal": 2.0, + "TilingProfiler::PfTransposeInstructions": 276.0, + "TilingProfiler::PfTransposeInstructionsForIo": 68.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 80.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 128.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 0.0, + "TilingProfiler::SimdInstructionsAfterTiling": 216.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg0002": { + "compiletime": { + "AGOrderingAnalysisPass": 0.019578933715820313, + "AffinePredicateResolution": 0.0019481182098388672, + "AliasDependencyElimination": 0.0001239776611328125, + "AliasDependencyInduction": 0.00577092170715332, + "AliasDependencyReset": 0.027690649032592773, + "BFComputeCutting": 0.0023322105407714844, + "BirCodeGenLoop": 0.4628438949584961, + "CCOpFusion": 0.022275209426879883, + "CanonicalizeDAGForPGTiling": 0.005593061447143555, + "CanonicalizeIR": 0.001634359359741211, + "CoalesceCCOp": 0.015367984771728516, + "CommuteConcat": 0.0008616447448730469, + "DMALocalityOpt": 0.007138729095458984, + "DMAProfiler": 0.011677265167236328, + "DMATilingProfiler": 0.0037431716918945313, + "DataLocalityOpt": 0.06741714477539063, + "DataStreaming": 0.03589940071105957, + "DeConcat": 0.0005049705505371094, + "DeadCodeElimination": 0.0009002685546875, + "DeadStoreElimination": 0.0056514739990234375, + "DelinearIndices": 0.004773139953613281, + "Delinearization": 0.0026137828826904297, + "DoNothing": 7.462501525878906e-05, + "DramToDramTranspose": 0.019293546676635742, + "DumpGraphAndMetadata": 0.10360383987426758, + "EliminateDivs": 0.003831148147583008, + "ExpandBatchNorm": 0.0019576549530029297, + "ExpandISAMacro": 0.011517524719238281, + "FactorizeBlkDims": 0.008472919464111328, + "FactorizeThreadAxesInFreeDims": 0.001847982406616211, + "FlattenMacroLoop": 0.003529787063598633, + "GenericAccessSimplifier": 0.0008223056793212891, + "InferInitValue": 0.025947093963623047, + "InferIntrinsicOnCC": 0.00908350944519043, + "InferNeuronTensor": 0.02371978759765625, + "InferNonlocalTensors": 0.014753341674804688, + "InferPSumTensor": 0.3085360527038574, + "InlineNativeKernels": 0.008690595626831055, + "InsertIOTransposes": 0.01906275749206543, + "InsertLocalTransposes": 0.004312276840209961, + "InsertOffloadedTransposes": 0.002802133560180664, + "LICM": 0.003081083297729492, + "LateLegalizeInst": 0.01367807388305664, + "LateLegalizePostSplit": 0.012533903121948242, + "LateLowerReshapeOp": 0.001035451889038086, + "LateLowerTensorOp": 0.002605438232421875, + "LateNeuronInstComb": 0.008839130401611328, + "LayoutPreprocessing": 0.03434133529663086, + "LayoutPreprocessingAndAnalysis": 0.07319903373718262, + "LayoutRequirementAnalysis": 0.005194187164306641, + "LegalizeCCOpLayout": 0.0025322437286376953, + "LegalizeOpLevelAlias": 0.0020308494567871094, + "LegalizePartitionReduce": 0.0010001659393310547, + "LegalizeSundaAccess": 0.07694768905639648, + "LegalizeSundaMacro": 0.011176109313964844, + "LegalizeType": 0.014355182647705078, + "LocalLayoutOpt": 0.014019250869750977, + "LoopFusion": 0.005472898483276367, + "LoopSplitting": 0.00038623809814453125, + "LowerBroadcast": 0.0025022029876708984, + "LowerCCOpBlockAxis": 0.0058476924896240234, + "LowerComplexBroadcast": 0.00213623046875, + "LowerIntrinsics": 0.30684900283813477, + "LowerTensorOp": 0.010679960250854492, + "LowerTranspose": 0.012311697006225586, + "MacroGeneration": 0.029733657836914063, + "MaskPropagation": 0.0028328895568847656, + "MemcpyElimination": 0.026583433151245117, + "MutateDataType": 0.0020093917846679688, + "NeuronAliasDependencyInduction": 0.00018548965454101563, + "NeuronAliasDependencyReset": 0.02524423599243164, + "NeuronInstComb": 0.003789663314819336, + "NeuronLICM": 0.03511476516723633, + "NeuronLoopFusion": 0.007987260818481445, + "NeuronLoopInterchange": 0.0023233890533447266, + "NeuronSimplifier": 0.0075054168701171875, + "NeuronSimplifyPredicates": 0.11913681030273438, + "NeuronValueNumbering": 0.0033991336822509766, + "OptimizeAliasedCopyChain": 0.0005936622619628906, + "OptimizeNKIKernels": 0.44962644577026367, + "PAGLayoutOpt": 0.0999138355255127, + "PComputeCutting": 0.005170106887817383, + "PGLayoutTilingPipeline": 0.7408750057220459, + "PGTiling": 0.29245758056640625, + "PadElimination": 0.000308990478515625, + "ParAxesAnnotation": 0.05283546447753906, + "PartialLoopFusion": 0.0043125152587890625, + "PartialSimdFusion": 0.004901885986328125, + "PerfectLoopNest": 0.001722574234008789, + "RecognizeOpIdiom": 0.004076480865478516, + "Recompute": 0.0002620220184326172, + "RelaxPredicates": 0.013286828994750977, + "Rematerialization": 0.0021238327026367188, + "ReshapeWeights": 0.0006799697875976563, + "ResolveAccessConflict": 0.0040090084075927734, + "ResolveComplicatePredicates": 0.001981496810913086, + "RewriteReplicationMatmul": 0.0021796226501464844, + "RewriteWeights": 0.0022602081298828125, + "SFKVectorizer": 0.27124762535095215, + "SimpleAllReduceTiling": 0.00896596908569336, + "Simplifier": 0.0046122074127197266, + "SimplifyMacroPredicates": 0.010458230972290039, + "SimplifyNeuronTensor": 1.0512049198150635, + "SimplifySlice": 0.0009145736694335938, + "SimplifyTensor": 0.00577855110168457, + "SpillPSum": 0.012126922607421875, + "SplitAPUnionSets": 0.10518908500671387, + "SplitAccGrp": 0.001172780990600586, + "StaticProfiler": 0.0124053955078125, + "StaticTransposeLocalTensor": 0.0038576126098632813, + "SundaISel": 0.04396390914916992, + "TCTransform": 0.0018804073333740234, + "TensorInitialization": 0.012793779373168945, + "TensorOpSimplifier": 0.0045316219329833984, + "TensorOpTransform": 0.021115541458129883, + "TileCCOps": 0.0056231021881103516, + "TilingProfiler": 0.00790858268737793, + "TransformConvOp": 0.0030431747436523438, + "TritiumFusion": 0.03186154365539551, + "ValueNumbering": 0.0038623809814453125, + "VectorizeDMA": 0.0021522045135498047, + "VectorizeMatMult": 0.003453969955444336, + "WeightCoalescing": 0.009035825729370117, + "ZeroSizeTensorElimination": 0.00011420249938964844 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 43318.0, + "StaticProfiler::AifUb": 154.8094024658203, + "StaticProfiler::ArithmeticIntensityTensorizer": 152.92723083496094, + "StaticProfiler::AverageDmaLength": 4809.89794921875, + "StaticProfiler::AverageFractalPeUtilization": 99.65782165527344, + "StaticProfiler::AveragePartitionUtilization": 97.58238220214844, + "StaticProfiler::AveragePeUtilization": 98.61824035644531, + "StaticProfiler::DDRTransferBytes": 787141440.0, + "StaticProfiler::InternalTransferBytes": 634853888.0, + "StaticProfiler::LoadExpanded": 98070.0, + "StaticProfiler::LocalizationEfficiency": 98.78419494628906, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 100.47209167480469, + "StaticProfiler::StoreExpanded": 2397.0, + "StaticProfiler::TotalDMAExpanded": 100467.0, + "StaticProfiler::TotalDynamicInstancesCount": 50670.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 50224.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 4.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 22848.0, + "TilingProfiler::NumPfTransposes": 5.0, + "TilingProfiler::NumPfTransposesForIo": 1.0, + "TilingProfiler::NumPfTransposesForLocal": 1.0, + "TilingProfiler::NumPfTransposesForNonlocal": 3.0, + "TilingProfiler::PfTransposeInstructions": 19201.0, + "TilingProfiler::PfTransposeInstructionsForIo": 19008.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 1.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 192.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 4.0, + "TilingProfiler::SimdInstructionsAfterTiling": 158.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg01": { + "compiletime": { + "CanonicalizeConv": 1.9999999949504854e-06, + "CanonicalizeForTensorizer": 1.4999999621068127e-05, + "Canonicalizer": 0.0002589999930933118, + "HoistCompute": 4.999999873689376e-06, + "IdentifyCrossPassTensors": 7.999999979801942e-06, + "MemcastMotion": 7.999999979801942e-06, + "PenguinizeFunctions": 1.5999999959603883e-05, + "PruneFunctions": 2.099999983329326e-05, + "RemoveOptimizationBarriers": 7.999999979801942e-06, + "ScatterMotion": 3.7999998312443495e-05, + "TensorizerLegalizationPass": 1.9999999494757503e-05, + "VerifySupportedOps": 9.999999747378752e-06, + "algsimp": 6.199999916134402e-05, + "batchnorm_expander": 1.2999999853491317e-05, + "boundary-marker-removal": 3.999999989900971e-06, + "call-inliner": 9.999999747378752e-06, + "canonicalize-boundary-marker": 6.000000212225132e-06, + "collective-stream-id-checker": 3.999999989900971e-06, + "comparison-expander": 4.999999873689376e-06, + "computation-deduplicator": 2.4000000848900527e-05, + "conditional-to-select": 4.999999873689376e-06, + "config-lowering": 2.099999983329326e-05, + "constant_folding": 7.999999979801942e-06, + "cse": 1.2000000424450263e-05, + "dce": 9.999999974752427e-07, + "dynamic-slice-transpose": 3.999999989900971e-06, + "eliminate-redundant-compare": 3.999999989900971e-06, + "emit-offloaded-dropout": 1.4000000192027073e-05, + "flatten-call-graph": 9.000000318337698e-06, + "fuse-send-recv": 2.9000000722589903e-05, + "hilo::LegalizeAlias": 4.999999873689376e-06, + "hilo::NeuronInstCombine": 4.5000000682193786e-05, + "hilo::NeuronOpFusion": 1.5999999959603883e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 1.1000000085914508e-05, + "hilo::ScheduleFusion": 9.999999974752427e-07, + "hilo::SixtyFourHack": 1.4999999621068127e-05, + "hilo::VerifyAliasing": 1.9999999949504854e-06, + "hlo-mac-count": 2.5999999706982635e-05, + "hlo-verifier": 0.00020500000391621143, + "legalize-ccops": 9.999999974752427e-07, + "legalize-compare": 3.999999989900971e-06, + "lower-argminmax-custom-call": 3.999999989900971e-06, + "map-inline": 1.2000000424450263e-05, + "metadata-naming": 2.499999936844688e-05, + "mlir::detail::OpToOpPassAdaptor": 0.00012799999967683107, + "mlir::hlo::MhloToPyPenguin": 0.0009619999909773469, + "mlir::mhlo::LowerComplexExtraPass": 8.099999831756577e-05, + "mlir::mhlo::LowerComplexPass": 3.999999989900971e-06, + "native-to-custom-softmax": 6.000000212225132e-06, + "native-to-custom-softmax-dx": 1.5999999959603883e-05, + "operand_upcaster": 2.099999983329326e-05, + "post-par-pipe-begin": 4.999999873689376e-06, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0005779999773949385, + "replace-minimum-constant": 6.000000212225132e-06, + "reshape-mover": 3.000000106112566e-06, + "simplify-concat": 4.8999998398358e-05, + "simplify-while-loops": 1.9999999949504854e-06, + "transform-variadic-reduce": 9.000000318337698e-06, + "tuple-simplifier": 4.999999873689376e-06, + "unpack-nested-aws-ntwsr": 3.999999989900971e-06, + "unroll-while-loop": 0.0 + }, + "hilo": { + "ArithmeticIntensity": 240.22828674316406, + "HloMacCount": 24964497408.0, + "Traffic": 207839776.0 + } + }, + "sg02": { + "compiletime": { + "CanonicalizeConv": 0.0, + "CanonicalizeForTensorizer": 1.2999999853491317e-05, + "Canonicalizer": 0.000311999989207834, + "HoistCompute": 0.0, + "IdentifyCrossPassTensors": 1.2000000424450263e-05, + "MemcastMotion": 0.0, + "PenguinizeFunctions": 1.1000000085914508e-05, + "PruneFunctions": 7.999999979801942e-06, + "RemoveOptimizationBarriers": 1.2000000424450263e-05, + "ScatterMotion": 0.0, + "TensorizerLegalizationPass": 7.000000096013537e-06, + "VerifySupportedOps": 1.1000000085914508e-05, + "algsimp": 5.999999848427251e-05, + "batchnorm_expander": 1.2999999853491317e-05, + "boundary-marker-removal": 3.999999989900971e-06, + "call-inliner": 1.1000000085914508e-05, + "canonicalize-boundary-marker": 6.000000212225132e-06, + "collective-stream-id-checker": 3.000000106112566e-06, + "comparison-expander": 4.999999873689376e-06, + "computation-deduplicator": 2.499999936844688e-05, + "conditional-to-select": 7.000000096013537e-06, + "config-lowering": 2.5999999706982635e-05, + "constant_folding": 9.000000318337698e-06, + "cse": 1.2000000424450263e-05, + "dce": 9.999999974752427e-07, + "dynamic-slice-transpose": 3.999999989900971e-06, + "eliminate-redundant-compare": 3.999999989900971e-06, + "emit-offloaded-dropout": 1.4000000192027073e-05, + "flatten-call-graph": 1.2000000424450263e-05, + "fuse-send-recv": 1.8000000636675395e-05, + "hilo::LegalizeAlias": 1.9999999949504854e-06, + "hilo::NeuronInstCombine": 1.2999999853491317e-05, + "hilo::NeuronOpFusion": 0.0, + "hilo::ReplaceTokenTypeWithU8Pass": 1.5999999959603883e-05, + "hilo::ScheduleFusion": 0.0, + "hilo::SixtyFourHack": 5.900000178371556e-05, + "hilo::VerifyAliasing": 9.999999974752427e-07, + "hlo-mac-count": 0.0001720000000204891, + "hlo-verifier": 0.0001880000054370612, + "legalize-ccops": 9.999999974752427e-07, + "legalize-compare": 3.000000106112566e-06, + "lower-argminmax-custom-call": 3.999999989900971e-06, + "map-inline": 1.2000000424450263e-05, + "metadata-naming": 1.4999999621068127e-05, + "mlir::detail::OpToOpPassAdaptor": 1.9999999494757503e-05, + "mlir::hlo::MhloToPyPenguin": 0.0008660000166855752, + "mlir::mhlo::LowerComplexExtraPass": 9.100000170292333e-05, + "mlir::mhlo::LowerComplexPass": 0.00010599999950500205, + "native-to-custom-softmax": 6.000000212225132e-06, + "native-to-custom-softmax-dx": 2.300000051036477e-05, + "operand_upcaster": 2.300000051036477e-05, + "post-par-pipe-begin": 1.9999999949504854e-06, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0005329999839887023, + "replace-minimum-constant": 9.000000318337698e-06, + "reshape-mover": 3.000000106112566e-06, + "simplify-concat": 4.400000034365803e-05, + "simplify-while-loops": 1.9999999949504854e-06, + "transform-variadic-reduce": 6.0999998822808266e-05, + "tuple-simplifier": 4.999999873689376e-06, + "unpack-nested-aws-ntwsr": 3.999999989900971e-06, + "unroll-while-loop": 0.0 + }, + "hilo": { + "ArithmeticIntensity": 50.378170013427734, + "HloMacCount": 19638517760.0, + "Traffic": 779643968.0 + } + } +} \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk1/graph.neff b/context_encoding_model/_tp0_bk1/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..2de3aaa470a89d9b3fd1549f8824d3ae3c51d59e --- /dev/null +++ b/context_encoding_model/_tp0_bk1/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60a8e4c285a690a146d149c675038f0498f62f761e4e3893706941d7ca8af583 +size 1659904 diff --git a/context_encoding_model/_tp0_bk1/log-neuron-cc.txt b/context_encoding_model/_tp0_bk1/log-neuron-cc.txt new file mode 100644 index 0000000000000000000000000000000000000000..607f3d9ae303d3f6e9f0df8acc802d1e33954391 --- /dev/null +++ b/context_encoding_model/_tp0_bk1/log-neuron-cc.txt @@ -0,0 +1,5046 @@ +2025-08-07T13:53:50Z INFO 47579 [root]: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/bin/neuronx-cc compile --framework=XLA /home/ubuntu/qwen3/context_encoding_model/_tp0_bk1/model.MODULE_2914133a46cb7b4660ab+d7af8a84.hlo_module.pb --output /home/ubuntu/qwen3/context_encoding_model/_tp0_bk1/model.MODULE_2914133a46cb7b4660ab+d7af8a84.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma' --lnc=1 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=/home/ubuntu/qwen3/context_encoding_model/_tp0_bk1/log-neuron-cc.txt --verbose=35 +2025-08-07T13:53:50Z INFO 47579 [root]: NeuronX Compiler version 2.20.9961.0+0acef03a Python version 3.10.12 HWM version 2.20.0.9961+0acef03a NumPy version 1.26.4 Running on AMI ami-040348201d80b58ad Running in region usw2-az4 +2025-08-07T13:53:50Z INFO 47841 [root]: XLA detected +2025-08-07T13:53:50Z INFO 47841 [root]: Pipeline: HLOToTensorizer Frontend StaticIOTranspose WalrusDriver BIRLinker Kelper NeffWrapper +2025-08-07T13:53:51Z INFO 47841 [root]: Intermediate files stored in /home/ubuntu/qwen3/context_encoding_model/_tp0_bk1/neuronxcc-k7bxqych, output in /home/ubuntu/qwen3/context_encoding_model/_tp0_bk1 +2025-08-07T13:53:51Z INFO 47841 [pipeline.Pipeline.0]: Job Pipeline len(in_states) 1 +2025-08-07T13:53:51Z INFO 47841 [pipeline.Pipeline.0]: Processing input #0 +2025-08-07T13:53:51Z INFO 47841 [pipeline.Pipeline.0]: Running pipeline Pipeline.0 +2025-08-07T13:53:51Z INFO 47841 [pipeline.Pipeline.0]: Starting job job.HLOToTensorizer.0 +2025-08-07T13:53:51Z INFO 47841 [job.HLOToTensorizer.0]: Job HLOToTensorizer len(in_states) 1 +2025-08-07T13:53:51Z INFO 47841 [job.HLOToTensorizer.0]: Processing input #0 +2025-08-07T13:53:51Z INFO 47841 [job.HLOToTensorizer.0]: IR signature: b03debb723d63387ea26771f63729d616ac71a0dbfcb78d21d2194ff723fcbc1 for model.MODULE_2914133a46cb7b4660ab+d7af8a84.hlo_module.pb +2025-08-07T13:53:51Z INFO 47841 [job.HLOToTensorizer.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo2penguin --input /home/ubuntu/qwen3/context_encoding_model/_tp0_bk1/model.MODULE_2914133a46cb7b4660ab+d7af8a84.hlo_module.pb --out-dir ./ --output penguin.py --remat --max-costly-ops=2 --max-live-in-size=5 --max-remat-chain-size=10 --max-mem-multiple=1.8 --min-def-use-distance=500 --remat-policy=transformer --allow-same-pass-remat=true --layers-per-module=1 --partition --emit-tensor-level-dropout-ops --modular-flow-mac-threshold=10 --verify-hlo=true --native-to-custom-softmax --partitioner-opts='--transformer' +2025-08-07T13:53:51Z INFO 47841 [job.HLOToTensorizer.0]: DEBUG: needsModular_PreSplit? Yes. macCnt 899033088000 threshold 4398046511104 num non-trivial Ops 3871 +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 38 + +Pre-Partition Pre-Opt Histogram: +total HLO instructions: 10617 + reshape 2091 19.69% ################################################################ + broadcast 1731 16.30% #################################################### + convert 1281 12.07% ####################################### + transpose 1268 11.94% ###################################### + constant 815 7.68% ######################## + parameter 475 4.47% ############## + slice 445 4.19% ############# + add 365 3.44% ########### + multiply 327 3.08% ########## + dot 326 3.07% ######### + get-tuple-element 295 2.78% ######### + select 255 2.40% ####### + compare 222 2.09% ###### + call 186 1.75% ##### + concatenate 148 1.39% #### + tuple 73 0.69% ## + scatter 73 0.69% ## + negate 72 0.68% ## + all-reduce 72 0.68% ## + custom-call 38 0.36% # + divide 37 0.35% # + iota 7 0.07% + gather 6 0.06% + all-gather 3 0.03% + reduce 3 0.03% + sine 1 0.01% + cosine 1 0.01% + maximum 1 0.01% + +INFO: IoStatistics: total inputs: 475 +INFO: IoStatistics: total outputs: 73 +INFO: IoStatistics: total passthrough tensors: 0 +INFO: IoStatistics: total outputs read from: 0 +INFO: IoStatistics: total redundant outputs: 0 +INFO: IoStatistics: total ifmap size (KiB): 8072796 +INFO: IoStatistics: total ofmap size (KiB): 73728 +INFO: IoStatistics: total must-alias size (KiB): 73728 +INFO: IoStatistics: total may-alias size (KiB): 0 +INFO: HloMacCount has found 899033071616 +INFO: Traffic has found 8421286061 +INFO: AIF 213.51 + +Pre-Partition Post-Op Histogram: +total HLO instructions: 6623 + reshape 1424 21.50% ################################################################ + convert 992 14.98% ############################################ + transpose 941 14.21% ########################################## + constant 523 7.90% ####################### + parameter 475 7.17% ##################### + broadcast 410 6.19% ################## + dot 325 4.91% ############## + custom-call 223 3.37% ########## + multiply 219 3.31% ######### + add 219 3.31% ######### + get-tuple-element 151 2.28% ###### + slice 147 2.22% ###### + concatenate 146 2.20% ###### + select 110 1.66% #### + compare 76 1.15% ### + scatter 73 1.10% ### + negate 72 1.09% ### + all-reduce 72 1.09% ### + gather 6 0.09% + iota 5 0.08% + all-gather 3 0.05% + reduce 3 0.05% + pad 2 0.03% + sine 1 0.02% + divide 1 0.02% + tuple 1 0.02% + maximum 1 0.02% + rng 1 0.02% + cosine 1 0.02% + +INFO: Found memory bound graph +DEBUG: needsModular_PreSplit? Yes. macCnt 899033071616 threshold 4398046511104 num non-trivial Ops 2702 +DEBUG: transformer model +INFO: Partitioner configs:ModularFlow BO LBL SA ConcatGraphs: 1 MaxDisj:2 MaxSep:4 LPM:1 +INFO: Markers NOT detected +Potential split-points stats: #CC 75 #AR 72 #AG 3 #BN 0 nClamp 0 +DEBUG: needsModular_SplitFinder? Yes. +ModuleSplitter initial partitioning... #parts 75 +ModuleSplitter initial partitioning... Done. +INFO: Num of unique Module Definitions: 6 +DEBUG: DefMap: 0 1 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 73 74 +New disjoint wave: start 2 len 70 NumReps: 35 macs 873757409280 +INFO: Attempting to identify and split optimizer at end +First non-zero-mac/used part from the end is 73 +Not enough zero-mac parts. skip +INFO: Optimized 0 all-reduce split instructions +INFO: Number of splitPoints: 37 +ModuleSplitter initial partitioning... #parts 37 +ModuleSplitter initial partitioning... Done. +Remat: gather-iota 0 matches, 0 ops rematted +INFO: Alias legality verification of partitions PASSED. +INFO: No transposable_weight_idx attrs found +INFO: Peak intermediate memory demand is at Partition 1. Num live intermediates at peak is 9 and memory usage is 8585220 bytes. +INFO: Please refer to LiveRangeReport_PostHloPart.txt for detailed intermediate lifetime info. +DEBUG: DefMap: 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 36 +Wrote HLO netlist to hlo_netlist.json +Wrote graph partitions in debug_info_hlo_partitions.json +Processing partition 0 +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 0 +Replaced 0 dropout sequences with OffloadedDropout +INFO: HloMacCount has found 5637144576 +INFO: Traffic has found 676023078 +INFO: AIF 16.68 +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert cosine custom-call dot gather get-tuple-element iota multiply negate parameter reshape scatter select sine slice transpose tuple +Invoking RemoveOptimizationBarriers pass +Processing partition 1 +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 0 +Replaced 0 dropout sequences with OffloadedDropout +INFO: HloMacCount has found 24964497408 +INFO: Traffic has found 207839780 +INFO: AIF 240.23 +HLO Ops used in computation: add all-reduce broadcast compare concatenate constant convert custom-call dot get-tuple-element multiply negate parameter reshape scatter select slice transpose tuple +Invoking RemoveOptimizationBarriers pass +Processing partition 2 +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 0 +Replaced 0 dropout sequences with OffloadedDropout +INFO: HloMacCount has found 19638517760 +INFO: Traffic has found 779643979 +INFO: AIF 50.38 +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert custom-call divide dot gather get-tuple-element iota maximum multiply pad parameter reduce reshape rng scatter select slice transpose tuple +Invoking RemoveOptimizationBarriers pass + +2025-08-07T13:53:51Z INFO 47841 [job.HLOToTensorizer.0]: IR signature: bf2e39aa7771e8cc2cf4df2c8e7022ed3e5c9e3ea93743f2ec86ac80e2cdc7a9 for sg0000/HLOToTensorizer +2025-08-07T13:53:51Z INFO 47841 [job.HLOToTensorizer.0]: IR signature: c324833c64b1c3b76a3850355e3f07ded481aafa40790c4a4d2a3de65e7d3b12 for sg0001/HLOToTensorizer +2025-08-07T13:53:51Z INFO 47841 [job.HLOToTensorizer.0]: IR signature: b6c0487545358e52926e0244748a4c5f3f04127377e8886d466c1c81530961f6 for sg0002/HLOToTensorizer +2025-08-07T13:53:51Z INFO 47841 [job.HLOToTensorizer.0]: Job #0 finished +2025-08-07T13:53:51Z INFO 47841 [pipeline.Pipeline.0]: Finished job job.HLOToTensorizer.0 +2025-08-07T13:53:51Z INFO 47841 [pipeline.Pipeline.0]: Starting job job.Frontend.0 +2025-08-07T13:53:51Z INFO 47841 [job.Frontend.0]: Job Frontend len(in_states) 1 +2025-08-07T13:53:51Z INFO 47841 [job.Frontend.0]: Processing input #0 +2025-08-07T13:53:51Z INFO 47841 [job.Frontend.0]: Start model loading +2025-08-07T13:53:51Z INFO 47841 [job.Frontend.0]: Start tensorization +2025-08-07T13:53:52Z INFO 47841 [job.Frontend.0]: Num jobs: 128 +2025-08-07T13:53:52Z USER 47841 [root/Tensorizer/Tensorizer]: Running Tensorizer +2025-08-07T13:53:52Z INFO 47841 [Tensorizer]: Max workers: 3 +2025-08-07T13:53:52Z INFO 49107 [Tensorizer]: Building model from Penguin script "penguin.py.000000"... +2025-08-07T13:53:52Z INFO 49109 [Tensorizer]: Building model from Penguin script "penguin.py.000002"... +2025-08-07T13:53:52Z INFO 49108 [Tensorizer]: Building model from Penguin script "penguin.py.000001"... +2025-08-07T13:53:52Z INFO 49108 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=1 --num-neuroncores-per-sengine=1 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-08-07T13:53:52Z INFO 49107 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=1 --num-neuroncores-per-sengine=1 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-08-07T13:53:52Z INFO 49109 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=1 --num-neuroncores-per-sengine=1 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:53:52Z INFO 49108 [sg0001/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:53:52Z INFO 49108 [sg0001/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.001 seconds +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.001 seconds +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/DoNothing]: DoNothing finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.001 seconds +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.003 seconds +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-08-07T13:53:52Z INFO 49108 [sg0001/Tensorizer/DoNothing]: DoNothing finished after 0.001 seconds +2025-08-07T13:53:52Z INFO 49108 [sg0001/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49108 [sg0001/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.011 seconds +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.005 seconds +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.023 seconds +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.005 seconds +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49108 [sg0001/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.001 seconds +2025-08-07T13:53:52Z INFO 49108 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-08-07T13:53:52Z INFO 49108 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49108 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49108 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:52Z INFO 49108 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49108 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49108 [sg0001/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-08-07T13:53:52Z INFO 49108 [sg0001/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49108 [sg0001/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.003 seconds +2025-08-07T13:53:52Z INFO 49108 [sg0001/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-08-07T13:53:52Z INFO 49108 [sg0001/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.001 seconds +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.003 seconds +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.013 seconds +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/LegalizeCCOpLayout]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.003 seconds +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.007 seconds +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.070 seconds +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.007 seconds +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/LegalizeCCOpLayout]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.003 seconds +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.005 seconds +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.003 seconds +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.017 seconds +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.004 seconds +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.013 seconds +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.004 seconds +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.029 seconds +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.005 seconds +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.009 seconds +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.034 seconds +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.021 seconds +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/LateLowerTensorOp]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.003 seconds +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.006 seconds +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.028 seconds +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-08-07T13:53:52Z INFO 49108 [sg0001/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.012 seconds +2025-08-07T13:53:52Z INFO 49108 [sg0001/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:53:52Z INFO 49108 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:52Z INFO 49108 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49108 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:52Z INFO 49108 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49108 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.027 seconds +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.112 seconds +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:52Z INFO 49109 [sg0002/Tensorizer/LoopFusion]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49108 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.007 seconds +2025-08-07T13:53:52Z INFO 49108 [sg0001/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.051 seconds +2025-08-07T13:53:52Z INFO 49108 [sg0001/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/LoopFusion]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49108 [sg0001/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.030 seconds +2025-08-07T13:53:52Z INFO 49107 [sg0000/Tensorizer/Rematerialization]: Running Rematerialization +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/Rematerialization]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.007 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/Rematerialization]: Rematerialization finished after 0.004 seconds +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49109 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.016 seconds +2025-08-07T13:53:53Z INFO 49109 [sg0002/Tensorizer/Rematerialization]: Running Rematerialization +2025-08-07T13:53:53Z INFO 49109 [sg0002/Tensorizer/Rematerialization]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49109 [sg0002/Tensorizer/Rematerialization]: Rematerialization finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49109 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:53Z INFO 49109 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.005 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.001 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.013 seconds +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/Delinearization]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.008 seconds +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.015 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.005 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.040 seconds +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.005 seconds +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.034 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.005 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.007 seconds +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.001 seconds +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.004 seconds +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.008 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.077 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.003 seconds +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/PadElimination]: Running PadElimination +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/PadElimination]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/PadElimination]: PadElimination finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49109 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.010 seconds +2025-08-07T13:53:53Z INFO 49109 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:53Z INFO 49109 [sg0002/Tensorizer/Delinearization]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.005 seconds +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.006 seconds +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.106 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.005 seconds +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/LoopFusion]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.029 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/Rematerialization]: Running Rematerialization +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/Rematerialization]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/Rematerialization]: Rematerialization finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.009 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/Delinearization]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.012 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.003 seconds +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/TCTransform]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.004 seconds +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.005 seconds +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.043 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.005 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.031 seconds +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/Recompute]: Running Recompute +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/Recompute]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:53:53Z INFO 49107 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.005 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/LoopFusion]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49109 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.010 seconds +2025-08-07T13:53:53Z INFO 49109 [sg0002/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.007 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49109 [sg0002/Tensorizer/DeadStoreElimination]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49109 [sg0002/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.019 seconds +2025-08-07T13:53:53Z INFO 49109 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:53Z INFO 49109 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.004 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.004 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/PadElimination]: Running PadElimination +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/PadElimination]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/PadElimination]: PadElimination finished after 0.000 seconds +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:53Z INFO 49108 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49109 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.003 seconds +2025-08-07T13:53:53Z INFO 49109 [sg0002/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:53Z INFO 49109 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49109 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49109 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:53Z INFO 49109 [sg0002/Tensorizer/Delinearization]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/LoopFusion]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49107 [Tensorizer]: After optimization: 26 statements +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/MutateDataType]: Running MutateDataType +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/MutateDataType]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/MutateDataType]: MutateDataType finished after 0.001 seconds +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/TileCCOps]: Running TileCCOps +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=2097152 is not above min_allgather_tile_size_in_bytes=8388608` +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/TileCCOps]: in bfloat16 (4096, 256) %'all_gather.1' = AllGatherOp-46 AllGather_add(bfloat16 (2048, 256) %'transpose.1', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((4096, 256), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.47 | hlo_id: 19 | , id = 46 +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/TileCCOps]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/TileCCOps]: TileCCOps finished after 0.006 seconds +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.008 seconds +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.001 seconds +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.011 seconds +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/LoopFusion]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.008 seconds +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.006 seconds +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/PadElimination]: Running PadElimination +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/PadElimination]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/PadElimination]: PadElimination finished after 0.000 seconds +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.005 seconds +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.008 seconds +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/LoopFusion]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.035 seconds +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/Recompute]: Running Recompute +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/Recompute]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49108 [Tensorizer]: After optimization: 25 statements +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/MutateDataType]: Running MutateDataType +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/MutateDataType]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/MutateDataType]: MutateDataType finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/InferIntrinsicOnCC]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.014 seconds +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/TileCCOps]: Running TileCCOps +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/TileCCOps]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.005 seconds +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.001 seconds +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.005 seconds +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.006 seconds +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/Recompute]: Running Recompute +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/Recompute]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.018 seconds +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/TileCCOps]: TileCCOps finished after 0.006 seconds +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.008 seconds +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.015 seconds +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.001 seconds +2025-08-07T13:53:54Z INFO 49109 [Tensorizer]: After optimization: 38 statements +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/MutateDataType]: Running MutateDataType +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/MutateDataType]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/MutateDataType]: MutateDataType finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.005 seconds +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/TileCCOps]: Running TileCCOps +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `All gather output tensor check failed` +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/TileCCOps]: in float32 (512,) %'all_gather.2' = AllGatherOp-149 AllGather_add(float32 (256,) %'add.11', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.8843 | hlo_id: 101 | , id = 149 +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=2048 is not above min_allgather_tile_size_in_bytes=8388608` +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/TileCCOps]: in uint32 (512,) %'all_gather.3' = AllGatherOp-165 AllGather_add(uint32 (256,) %'add.12', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.8978 | hlo_id: 110 | , id = 165 +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/TileCCOps]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/TileCCOps]: TileCCOps finished after 0.006 seconds +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.007 seconds +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.010 seconds +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.005 seconds +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.001 seconds +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-08-07T13:53:54Z INFO 49108 [sg0001/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.001 seconds +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-08-07T13:53:54Z INFO 49109 [sg0002/Tensorizer/InferIntrinsicOnCC]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.032 seconds +2025-08-07T13:53:54Z INFO 49107 [sg0000/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.007 seconds +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.075 seconds +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.001 seconds +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/InferIntrinsicOnCC]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.010 seconds +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.005 seconds +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.009 seconds +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.106 seconds +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.014 seconds +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.005 seconds +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.024 seconds +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.011 seconds +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.034 seconds +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.005 seconds +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.073 seconds +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.015 seconds +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.005 seconds +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.005 seconds +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.053 seconds +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.037 seconds +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.100 seconds +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.003 seconds +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.008 seconds +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.109 seconds +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.297 seconds +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.006 seconds +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.008 seconds +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.032 seconds +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.006 seconds +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/PGTiling]: Running PGTiling +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.332 seconds +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 585 of IO tensor {'CrossPassTensor': ''}bfloat16 %input471|NC|(128, 32) is not sorted, index list (w/ AG ids): [(16, 'AG49'), (13, 'AG50')] +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 586 of IO tensor {'CrossPassTensor': ''}bfloat16 %input472|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(16, 'AG49'), (13, 'AG50')] +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 587 of IO tensor {'CrossPassTensor': ''}bfloat16 %input470|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(16, 'AG49'), (13, 'AG50')] +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 588 of IO tensor {'CrossPassTensor': ''}bfloat16 %input469(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(10, 'AG54'), (15, 'AG52'), (11, 'AG53')] +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 589 of IO tensor {'CrossPassTensor': ''}bfloat16 %input474|NC|(128, 32) is not sorted, index list (w/ AG ids): [(16, 'AG49'), (13, 'AG50')] +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 540 of IO tensor {'CrossPassTensor': ''}bfloat16 %input473|NC|(75968, 32, 128) is not sorted, index list (w/ AG ids): [(14, 'AG59'), (13, 'AG50')] +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.020 seconds +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.005 seconds +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.003 seconds +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.005 seconds +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/PGTiling]: Running PGTiling +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.095 seconds +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.005 seconds +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.007 seconds +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.144 seconds +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.003 seconds +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.003 seconds +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 702 of IO tensor {'CrossPassTensor': ''}bfloat16 %input79|NC|(128, 32) is not sorted, index list (w/ AG ids): [(16, 'AG82'), (15, 'AG83')] +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 703 of IO tensor {'CrossPassTensor': ''}bfloat16 %input83(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(16, 'AG82'), (15, 'AG83')] +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 704 of IO tensor {'CrossPassTensor': ''}bfloat16 %input81(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(16, 'AG82'), (15, 'AG83')] +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 705 of IO tensor {'CrossPassTensor': ''}bfloat16 %input78|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(16, 'AG82'), (15, 'AG83')] +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 459 of IO tensor {'CrossPassTensor': ''}bfloat16 %input77(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(9, 'AG94'), (6, 'AG90'), (7, 'AG89'), (11, 'AG93'), (13, 'AG92')] +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 698 of IO tensor non_local bfloat16 %all_gather.1(32, 128, 256) is not sorted, index list (w/ AG ids): [(16, 'AG82'), (8, 'AG84')] +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.002 seconds +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.000 seconds +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.006 seconds +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/PGTiling]: Running PGTiling +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.082 seconds +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.005 seconds +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 669 of IO tensor {'CrossPassTensor': ''}bfloat16 %input86|NC|(128, 32) is not sorted, index list (w/ AG ids): [(15, 'AG85'), (12, 'AG86')] +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 670 of IO tensor {'CrossPassTensor': ''}bfloat16 %input87|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(15, 'AG85'), (12, 'AG86')] +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 671 of IO tensor {'CrossPassTensor': ''}bfloat16 %input85|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(15, 'AG85'), (12, 'AG86')] +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 672 of IO tensor {'CrossPassTensor': ''}bfloat16 %input84(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(7, 'AG90'), (14, 'AG88'), (8, 'AG89')] +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 673 of IO tensor {'CrossPassTensor': ''}bfloat16 %input90|NC|(128, 32) is not sorted, index list (w/ AG ids): [(15, 'AG85'), (12, 'AG86')] +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 674 of IO tensor {'CrossPassTensor': ''}bfloat16 %input94(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(15, 'AG85'), (12, 'AG86')] +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 679 of IO tensor {'CrossPassTensor': ''}bfloat16 %input92(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(15, 'AG85'), (12, 'AG86')] +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 680 of IO tensor {'CrossPassTensor': ''}bfloat16 %input89|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(15, 'AG85'), (12, 'AG86')] +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 447 of IO tensor {'CrossPassTensor': ''}bfloat16 %input88(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(2, 'AG100'), (0, 'AG96'), (1, 'AG95'), (3, 'AG99'), (4, 'AG98')] +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.036 seconds +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.008 seconds +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.030 seconds +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/PGTiling]: PGTiling finished after 0.292 seconds +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.003 seconds +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.000 seconds +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.019 seconds +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.003 seconds +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.006 seconds +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.007 seconds +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-08-07T13:53:55Z INFO 49108 [sg0001/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.019 seconds +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 0.741 seconds +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/TilingBottleneck]: 19008: transpose_128x128 +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/TilingBottleneck]: 19008: matmul_128x128x1 +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x256 +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x256 +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/TilingBottleneck]: 768: matmul_128x128x512 +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/TilingBottleneck]: 64: transpose_128x128 +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/TilingBottleneck]: 64: transpose_128x128 +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/TilingBottleneck]: 64: transpose_128x128 +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/TilingBottleneck]: 48: simd128x256 +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/TilingBottleneck]: 32: rmsnorm128x256x128 +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/TilingBottleneck]: 32: simd128x256 +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/TilingBottleneck]: 32: rmsnorm128x256x128 +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/TilingBottleneck]: 2: indirect_load128x1 +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/TilingBottleneck]: 1: simd1x1 +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/TilingBottleneck]: 1: simd1x1 +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/TilingBottleneck]: 1: simd1x1 +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/TilingBottleneck]: 1: reduce256x1x1 +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/TilingBottleneck]: 1: simd1x256 +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/TilingBottleneck]: 1: reduce256x1x1 +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/TilingBottleneck]: 1: indirect_load32x128 +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.072 seconds +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/PGTiling]: PGTiling finished after 0.341 seconds +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-08-07T13:53:55Z INFO 49107 [sg0000/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.008 seconds +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 49109 [sg0002/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.012 seconds +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.004 seconds +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.024 seconds +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.005 seconds +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.033 seconds +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 1.329 seconds +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/TilingBottleneck]: 512: matmul_128x128x256 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/TilingBottleneck]: 256: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/TilingBottleneck]: 128: matmul_128x128x256 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/TilingBottleneck]: 64: simd32x512 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/TilingBottleneck]: 64: transpose_128x128 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/TilingBottleneck]: 64: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/TilingBottleneck]: 32: transpose_128x128 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/TilingBottleneck]: 32: rmsnorm128x256x128 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/TilingBottleneck]: 32: transpose_128x128 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/TilingBottleneck]: 32: transpose_128x128 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/TilingBottleneck]: 32: matmul_128x128x256 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/TilingBottleneck]: 16: softmax256x2x128 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/TilingBottleneck]: 16: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/TilingBottleneck]: 8: indirect_load128x512 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/TilingBottleneck]: 8: rmsnorm128x512x128 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/TilingBottleneck]: 8: simd128x256 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/TilingBottleneck]: 8: simd128x256 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/TilingBottleneck]: 8: simd128x512 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/TilingBottleneck]: 8: transpose_128x128 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/TilingBottleneck]: 8: simd128x64 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.003 seconds +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.000 seconds +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.003 seconds +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.014 seconds +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.006 seconds +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.014 seconds +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.119 seconds +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/PGTiling]: PGTiling finished after 0.518 seconds +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.067 seconds +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 19008: transpose_128x128 +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 19008: matmul_128x128x1 +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x256 +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x256 +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 768: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 594: transpose_128x1 +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 384: dma128x512 +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 64: transpose_128x128 +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 64: transpose_128x128 +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 64: transpose_128x128 +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 48: dma128x4096 +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 48: dma128x4096 +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 48: simd128x256 +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 32: rmsnorm128x256x128 +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 32: simd128x256 +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 32: rmsnorm128x256x128 +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 8: dma128x1024 +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 2: indirect_load128x1 +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 1: dma128x32 +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 1: simd1x1 +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.004 seconds +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.022 seconds +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.003 seconds +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.006 seconds +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.047 seconds +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.011 seconds +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.008 seconds +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.008 seconds +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.005 seconds +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.001 seconds +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.004 seconds +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/LICM]: LICM finished after 0.004 seconds +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.040 seconds +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 1.242 seconds +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x256 +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x256 +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/TilingBottleneck]: 768: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/TilingBottleneck]: 512: matmul_128x128x256 +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/TilingBottleneck]: 256: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/TilingBottleneck]: 128: matmul_128x128x256 +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/TilingBottleneck]: 64: transpose_128x128 +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/TilingBottleneck]: 64: transpose_128x128 +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/TilingBottleneck]: 64: transpose_128x128 +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/TilingBottleneck]: 64: transpose_128x128 +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/TilingBottleneck]: 64: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/TilingBottleneck]: 48: simd128x256 +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/TilingBottleneck]: 32: rmsnorm128x256x128 +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/TilingBottleneck]: 32: simd128x256 +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/TilingBottleneck]: 32: rmsnorm128x256x128 +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/TilingBottleneck]: 32: matmul_128x128x256 +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/TilingBottleneck]: 32: softmax256x1x128 +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/TilingBottleneck]: 16: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/TilingBottleneck]: 8: rmsnorm128x512x128 +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/TilingBottleneck]: 8: simd64x512 +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.010 seconds +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/InferInitValue]: Running InferInitValue +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.014 seconds +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/InferInitValue]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/InferInitValue]: InferInitValue finished after 0.026 seconds +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.015 seconds +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.008 seconds +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.008 seconds +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/SimplifyTensor]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.011 seconds +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.006 seconds +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/SundaISel]: Running SundaISel +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.052 seconds +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.009 seconds +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/SundaISel]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/SundaISel]: SundaISel finished after 0.044 seconds +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.000 seconds +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.025 seconds +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.118 seconds +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 512: matmul_128x128x256 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 256: dma32x128 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 256: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: matmul_128x128x256 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: dma128x512 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: dma32x512 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: simd32x512 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: transpose_128x128 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: transpose_128x128 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: dma128x256 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: rmsnorm128x256x128 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: transpose_128x128 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: transpose_128x128 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: matmul_128x128x256 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: dma128x512 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 16: dma128x4096 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 16: softmax256x2x128 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 16: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 8: indirect_load128x512 +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.004 seconds +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.011 seconds +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.003 seconds +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.009 seconds +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.006 seconds +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-08-07T13:53:56Z INFO 49109 [sg0002/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.011 seconds +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.003 seconds +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.008 seconds +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49108 [sg0001/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.010 seconds +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.004 seconds +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.001 seconds +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.003 seconds +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.013 seconds +2025-08-07T13:53:56Z INFO 49107 [sg0000/Tensorizer/InferInitValue]: Running InferInitValue +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/InferInitValue]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.004 seconds +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/InferInitValue]: InferInitValue finished after 0.032 seconds +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.008 seconds +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.012 seconds +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/SimplifyTensor]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.002 seconds +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.006 seconds +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/SundaISel]: Running SundaISel +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.005 seconds +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/SundaISel]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.008 seconds +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/SundaISel]: SundaISel finished after 0.046 seconds +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.137 seconds +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x256 +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x256 +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 768: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 512: matmul_128x128x256 +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 384: dma128x512 +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 256: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: matmul_128x128x256 +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: dma128x512 +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 64: transpose_128x128 +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 64: transpose_128x128 +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 64: transpose_128x128 +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 64: transpose_128x128 +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 64: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 48: dma128x4096 +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 48: dma128x4096 +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 48: simd128x256 +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: rmsnorm128x256x128 +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: simd128x256 +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: rmsnorm128x256x128 +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: matmul_128x128x256 +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.027 seconds +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/NeuronValueNumbering]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.003 seconds +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.004 seconds +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/VectorizeDMA]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.002 seconds +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.004 seconds +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.001 seconds +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/DeConcat]: Running DeConcat +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/DeConcat]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/DeConcat]: DeConcat finished after 0.001 seconds +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.002 seconds +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.000 seconds +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.150 seconds +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.005 seconds +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.002 seconds +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.002 seconds +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.032 seconds +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.008 seconds +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.003 seconds +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/PartialLoopFusion]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.004 seconds +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.005 seconds +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.004 seconds +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.012 seconds +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.003 seconds +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.019 seconds +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.002 seconds +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.009 seconds +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.013 seconds +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.001 seconds +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/SpillPSum]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.020 seconds +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/SpillPSum]: SpillPSum finished after 0.012 seconds +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.005 seconds +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.058 seconds +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronValueNumbering]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.012 seconds +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.004 seconds +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.012 seconds +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.016 seconds +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/VectorizeDMA]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.013 seconds +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.006 seconds +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.002 seconds +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.004 seconds +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.011 seconds +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.004 seconds +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.001 seconds +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.001 seconds +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/DeConcat]: Running DeConcat +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/DeConcat]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/DeConcat]: DeConcat finished after 0.001 seconds +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.004 seconds +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.002 seconds +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.008 seconds +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/InferInitValue]: Running InferInitValue +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.030 seconds +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/InferInitValue]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/InferInitValue]: InferInitValue finished after 0.038 seconds +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:57Z INFO 49107 [sg0000/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.307 seconds +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.013 seconds +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/SimplifyTensor]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.009 seconds +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/LegalizeType]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.006 seconds +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/LegalizeType]: LegalizeType finished after 0.014 seconds +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/LICM]: LICM finished after 0.004 seconds +2025-08-07T13:53:57Z INFO 49108 [sg0001/Tensorizer/SundaISel]: Running SundaISel +2025-08-07T13:53:57Z INFO 49109 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.038 seconds +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/SundaISel]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49109 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.035 seconds +2025-08-07T13:53:58Z INFO 49109 [sg0002/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.016 seconds +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.004 seconds +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/SundaISel]: SundaISel finished after 0.043 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.000 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.023 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.024 seconds +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.008 seconds +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.023 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.012 seconds +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.024 seconds +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/SpillPSum]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.007 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/SpillPSum]: SpillPSum finished after 0.019 seconds +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:53:58Z INFO 49109 [sg0002/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.013 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:53:58Z INFO 49109 [sg0002/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.309 seconds +2025-08-07T13:53:58Z INFO 49109 [sg0002/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49109 [sg0002/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.034 seconds +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.003 seconds +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/LegalizeType]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/LegalizeType]: LegalizeType finished after 0.005 seconds +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.048 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronValueNumbering]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.011 seconds +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.005 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.015 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/VectorizeDMA]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49109 [sg0002/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.009 seconds +2025-08-07T13:53:58Z INFO 49109 [sg0002/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.048 seconds +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/WeightCoalescing]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.003 seconds +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/DeConcat]: Running DeConcat +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/DeConcat]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/DeConcat]: DeConcat finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-08-07T13:53:58Z INFO 49109 [sg0002/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.043 seconds +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.005 seconds +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.027 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/TensorInitialization]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.015 seconds +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49109 [sg0002/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.077 seconds +2025-08-07T13:53:58Z INFO 49109 [sg0002/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.006 seconds +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:53:58Z INFO 49109 [sg0002/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49109 [sg0002/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.013 seconds +2025-08-07T13:53:58Z INFO 49109 [sg0002/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.004 seconds +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:53:58Z INFO 49109 [sg0002/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.054 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.011 seconds +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49109 [sg0002/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.013 seconds +2025-08-07T13:53:58Z INFO 49109 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.003 seconds +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/DataStreaming]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/DataStreaming]: DataStreaming finished after 0.005 seconds +2025-08-07T13:53:58Z INFO 49107 [sg0000/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.017 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.006 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.019 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.004 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:53:58Z INFO 49109 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49109 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.119 seconds +2025-08-07T13:53:58Z INFO 49109 [sg0002/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:53:58Z INFO 49109 [sg0002/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.011 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.001 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:53:58Z INFO 49109 [sg0002/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.012 seconds +2025-08-07T13:53:58Z INFO 49109 [sg0002/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.021 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/SpillPSum]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/SpillPSum]: SpillPSum finished after 0.017 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.036 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:53:58Z INFO 49108 [sg0001/Tensorizer/LegalizeType]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/LegalizeType]: LegalizeType finished after 0.005 seconds +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.008 seconds +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.032 seconds +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/WeightCoalescing]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.003 seconds +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.330 seconds +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.016 seconds +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.003 seconds +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.007 seconds +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.003 seconds +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.002 seconds +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.007 seconds +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.001 seconds +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/DataStreaming]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/DataStreaming]: DataStreaming finished after 0.004 seconds +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.005 seconds +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.002 seconds +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 74.128us (16.000MiB, est bw: 226.329GB/s, 24.258% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[4] bfloat16 (2, 2, 2, 2, 128, 4096) %'input83_local_1206'[i48_0_0_1477,i48_0_1_1477,i35_1_0_1477,i32_0_0_1_1,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (4, 2, 2, 128, 4096) %'input83'[2i48_0_0_1477+i48_0_1_1477,i35_1_0_1477,i32_0_0_1_1,i0.128,i1.4096] # id=1358, src_id=None, , instances=16 # dl = tensor_op_name: _dot.2 | hlo_id: 34 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 74.128us (16.000MiB, est bw: 226.329GB/s, 24.258% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (4, 2, 128, 16, 512) %'input77_local_1245'[i122_0_0_0_1251_0_1478,i122_0_0_0_1,i0.128,i3.16,i1.128+256p_1777+128i2.2] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (8, 2, 128, 16, 2, 128) %'input77'[2i122_0_0_0_1251_0_1478+i122_0_0_0_1,p_1777,i0.128,i3.16,i2.2,i1.128] # id=1462, src_id=None, , instances=16 # dl = tensor_op_name: _dot.3 | hlo_id: 145 | [[i0.128];[i1.128, i2.2, i3.16]] -> [[i0.128];[i1.128, i2.2, i3.16]] +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 26.222us (2.000MiB, est bw: 79.978GB/s, 8.581% of tot. time) for bfloat16<128 x 256> TongaSB partitions[1] bfloat16 (32, 128, 256) %'custom-call.226.1555'[i29_0_1191,i0.128,i1.256] = load bfloat16<128 x 256> non_local bfloat16 (32, 128, 256) %'all_gather.1'[i29_0_1191,i0.128,i1.256] # id=1353, src_id=None, , instances=32 # dl = tensor_op_name: _custom-call.226 | hlo_id: 27 | [[i0.128];[i1.256]] -> [[i0.128];[i1.256]] +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 21.519us (2.000MiB, est bw: 97.454GB/s, 7.042% of tot. time) for bfloat16<32 x 4096> TongaSB partitions[1] bfloat16 (8, 32, 8, 128, 4) %'all_gather.1_local_1158'[c0_1152,i0.32,i2.8,i3.128,i1.4] = load bfloat16<32 x 4096> non_local bfloat16 (32, 128, 256) %'all_gather.1'[i0.32,i3.128,32c0_1152+i1.4+4i2.8] # id=1316, src_id=None, , instances=8 # dl = tensor_op_name: UnnamedModule | hlo_id: 1 | [[i0.32];[i1.4, i2.8, i3.128]] -> [[i0.32];[i1.4, i2.8, i3.128]] +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 19.507us (4.000MiB, est bw: 215.017GB/s, 6.383% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[0] bfloat16 (128, 32, 512) %'input78_local_1233'[i0.128,i2.32,128p_1692+i1.128] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (4, 128, 32, 128) %'input78'[p_1692,i0.128,i2.32,i1.128] # id=1457, src_id=None, , instances=4 # dl = tensor_op_name: _dot | hlo_id: 131 | [[i0.128];[i1.128, i2.32]] -> [[i0.128];[i1.128, i2.32]] +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 19.507us (4.000MiB, est bw: 215.017GB/s, 6.383% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 2, 128, 4096) %'input81_local_1220'[i120_0_0_2115,i59_0_0_1_2114_2115,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 2, 128, 4096) %'input81'[i120_0_0_2115,i59_0_0_1_2114_2115,i0.128,i1.4096] # id=1403, src_id=None, , instances=4 # dl = tensor_op_name: _dot.1 | hlo_id: 82 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 13.374us (2.000MiB, est bw: 156.803GB/s, 4.377% of tot. time) for bfloat16<32 x 4096> {'IntermediateTensor': ''}bfloat16 (256, 32, 128) %'intermediate1'(init=0.0)[32i0_0_0_1164+i2.32,i0.32,i1.128] = store bfloat16<32 x 4096> TongaSB partitions[1] bfloat16 (8, 32, 32, 128) %'UnnamedModule.1785'[i0_0_0_1164,i0.32,i2.32,i1.128] # id=1318, src_id=None, , instances=8 # dl = tensor_op_name: UnnamedModule | hlo_id: 1 | [[i0.32];[i1.128, i2.32]] -> [[i0.32];[i1.128, i2.32]] +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 8.725us (1.000MiB, est bw: 120.176GB/s, 2.855% of tot. time) for bfloat16<128 x 256> non_local bfloat16 (4, 4, 128, 256) %'transpose.1'[T_i12_0_1120,2T_i12_1_1120_0_0+T_i12_1_1120_0_1_2123_2124,i0.128,i1.256] = store bfloat16<128 x 256> TongaSB partitions[2] bfloat16 (4, 2, 128, 512) %'1116.1783'[T_i12_0_1120,T_i12_1_1120_0_0,i0.128,i1.256+256T_i12_1_1120_0_1_2123_2124] # id=1496, src_id=None, , instances=16 # dl = tensor_op_name: transpose.1_pftranspose_1116 | hlo_id: 16 | [[i0.128];[i1.256]] -> [[i0.128];[i1.256]] +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 8.606us (2.000MiB, est bw: 243.684GB/s, 2.816% of tot. time) for bfloat16<128 x 1024> non_local bfloat16 (1048576,) %'dot.4-buffer-2167'[1024i122_0_0_0_1251_0_1478+4096i0.128+524288i123_0_1251_1478+i1.1024] = store bfloat16<128 x 1024> TongaSB partitions[2] bfloat16 (4, 2, 128, 1024) %1252[i122_0_0_0_1251_0_1478,i123_0_1251_1478,i0.128,i1.1024] # id=1465, src_id=None, , instances=8 # dl = tensor_op_name: _dot.3 | hlo_id: 145 | [[i0.128];[i1.1024]] -> [[i0.128];[i1.1024]] +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 6.372us (1.000MiB, est bw: 164.552GB/s, 2.085% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[1] bfloat16 (2, 128, 2048) %'transpose.1_pftranspose_1116'[i13_0,i0.128,i1.2048] = indirect_load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (151936, 2048) %'input76'[i0.128,i1.2048] generic generic_dims:[0] generic_addrs: int32<128 x 1> TongaSB partitions[0] int32 (128, 2, 1) %'input0_local_1149'[i0.128,i13_0,0] # id=1314, src_id=None, , attrs={'mode': OOBMode.ERROR}, instances=2 # dl = tensor_op_name: _gather.41 | hlo_id: 16 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.004 seconds +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/OptimizeNKIKernels]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.002 seconds +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.025 seconds +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.005 seconds +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.032 seconds +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.003 seconds +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.168 seconds +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.005 seconds +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.005 seconds +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.005 seconds +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.003 seconds +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 24.731% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[3] bfloat16 (4, 2, 2, 128, 24, 512) %'input84_local_908'[i15_0_0_914_0_1165,i15_0_0_1,c1_902,i0.128,i2.24,i1.128+128p_1330] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (8, 4, 2, 128, 24, 128) %'input84'[2i15_0_0_914_0_1165+i15_0_0_1,p_1330,c1_902,i0.128,i2.24,i1.128] # id=1076, src_id=None, , instances=64 # dl = tensor_op_name: _dot.6 | hlo_id: 49 | [[i0.128];[i1.128, i2.24]] -> [[i0.128];[i1.128, i2.24]] +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 219.783us (48.000MiB, est bw: 229.006GB/s, 23.488% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 24, 128, 4096) %'input85_local_889'[i10_0_0,i10_0_1,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 24, 128, 4096) %'input85'[i10_0_0,i10_0_1,i0.128,i1.4096] # id=1067, src_id=None, , instances=48 # dl = tensor_op_name: _dot.4 | hlo_id: 39 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 219.783us (48.000MiB, est bw: 229.006GB/s, 23.488% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 24, 128, 4096) %'input87_local_898'[i12_0_0,i12_0_1,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 24, 128, 4096) %'input87'[i12_0_0,i12_0_1,i0.128,i1.4096] # id=1070, src_id=None, , instances=48 # dl = tensor_op_name: _dot.5 | hlo_id: 30 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 74.128us (16.000MiB, est bw: 226.329GB/s, 7.922% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[3] bfloat16 (4, 2, 2, 128, 4096) %'input94_local_929'[i41_0,i41_1_0_1166,i25_0_0_1_1,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (4, 2, 2, 128, 4096) %'input94'[i41_0,i41_1_0_1166,i25_0_0_1_1,i0.128,i1.4096] # id=1090, src_id=None, , instances=16 # dl = tensor_op_name: _dot.9 | hlo_id: 67 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 74.128us (16.000MiB, est bw: 226.329GB/s, 7.922% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (4, 2, 128, 16, 512) %'input88_local_991'[i115_0_0_0_997_0_1167,i115_0_0_0_1,i0.128,i3.16,i1.128+256p_1350+128i2.2] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (8, 2, 128, 16, 2, 128) %'input88'[2i115_0_0_0_997_0_1167+i115_0_0_0_1,p_1350,i0.128,i3.16,i2.2,i1.128] # id=1145, src_id=None, , instances=16 # dl = tensor_op_name: _dot.10 | hlo_id: 165 | [[i0.128];[i1.128, i2.2, i3.16]] -> [[i0.128];[i1.128, i2.2, i3.16]] +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 19.507us (4.000MiB, est bw: 215.017GB/s, 2.085% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 2, 128, 4096) %'input92_local_943'[i55_0_1650,i52_0_0_1_1650,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 2, 128, 4096) %'input92'[i55_0_1650,i52_0_0_1_1650,i0.128,i1.4096] # id=1111, src_id=None, , instances=4 # dl = tensor_op_name: _dot.8 | hlo_id: 102 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 19.507us (4.000MiB, est bw: 215.017GB/s, 2.085% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[0] bfloat16 (128, 32, 512) %'input89_local_979'[i0.128,i2.32,128p_1339+i1.128] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (4, 128, 32, 128) %'input89'[p_1339,i0.128,i2.32,i1.128] # id=1140, src_id=None, , instances=4 # dl = tensor_op_name: _dot.7 | hlo_id: 151 | [[i0.128];[i1.128, i2.32]] -> [[i0.128];[i1.128, i2.32]] +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 10.403us (2.000MiB, est bw: 201.582GB/s, 1.112% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (2, 128, 4096) %'838.1290'[T_i0_0_1652,i0.128,i1.4096] = load bfloat16<128 x 4096> non_local bfloat16 (2, 128, 4096) %'add.4'[T_i0_0_1652,i0.128,i1.4096] # id=1168, src_id=None, , instances=2 # dl = tensor_op_name: add.4_pftranspose_838 | hlo_id: 17 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 10.403us (2.000MiB, est bw: 201.582GB/s, 1.112% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (2, 128, 4096) %'842.1295'[T_i0_0_1653,i0.128,i1.4096] = load bfloat16<128 x 4096> non_local bfloat16 (1048576,) %'all_reduce.1-buffer-1693'[524288T_i0_0_1653+4096i0.128+i1.4096] # id=1177, src_id=None, , instances=2 # dl = tensor_op_name: all_reduce.1_pftranspose_842 | hlo_id: 52 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 8.606us (2.000MiB, est bw: 243.684GB/s, 0.920% of tot. time) for bfloat16<128 x 1024> non_local bfloat16 (1048576,) %'dot.7-buffer-1691'[1024i15_0_0_914_0_1165+4096i0.128+524288i16_0_914_1165+i1.1024] = store bfloat16<128 x 1024> TongaSB partitions[2] bfloat16 (4, 2, 128, 1024) %915[i15_0_0_914_0_1165,i16_0_914_1165,i0.128,i1.1024] # id=1079, src_id=None, , instances=8 # dl = tensor_op_name: _dot.6 | hlo_id: 49 | [[i0.128];[i1.1024]] -> [[i0.128];[i1.1024]] +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.004 seconds +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/OptimizeNKIKernels]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.002 seconds +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:53:59Z INFO 49107 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.052 seconds +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.037 seconds +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.004 seconds +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-08-07T13:53:59Z INFO 49107 [Tensorizer]: BirCodeGen estimate #instances=2184 in sg0000 +2025-08-07T13:53:59Z INFO 49107 [Tensorizer]: IR signature: f5298a198791041768faa562686532f3bf5ba9e7877f2cc00980d18bca294ec4 for nc00/sg0000/TensorizerBIR +2025-08-07T13:53:59Z INFO 49107 [Tensorizer]: Weights total number of bytes: 246016 +2025-08-07T13:53:59Z INFO 49107 [Tensorizer]: Successfully built model. +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.019 seconds +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.003 seconds +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.004 seconds +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49108 [sg0001/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.035 seconds +2025-08-07T13:53:59Z INFO 49108 [Tensorizer]: BirCodeGen estimate #instances=6199 in sg0001 +2025-08-07T13:53:59Z INFO 49108 [Tensorizer]: IR signature: a25fb0b64873495136a890547dec6f3fd0490eda69672640b29d78faee63d6b0 for nc00/sg0001/TensorizerBIR +2025-08-07T13:53:59Z INFO 49108 [Tensorizer]: Weights total number of bytes: 147456 +2025-08-07T13:53:59Z INFO 49108 [Tensorizer]: Successfully built model. +2025-08-07T13:53:59Z INFO 49109 [sg0002/Tensorizer/SimplifyNeuronTensor]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49109 [sg0002/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 1.051 seconds +2025-08-07T13:53:59Z INFO 49109 [sg0002/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:53:59Z INFO 49109 [sg0002/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49109 [sg0002/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.007 seconds +2025-08-07T13:53:59Z INFO 49109 [sg0002/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:53:59Z INFO 49109 [sg0002/Tensorizer/DataStreaming]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/DataStreaming]: DataStreaming finished after 0.036 seconds +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.271 seconds +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.014 seconds +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.015 seconds +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.009 seconds +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 2.705ms (594.000MiB, est bw: 230.258GB/s, 73.200% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (594, 128, 4096) %'698.1075'[i31_0,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (75968, 4096) %'input473'[128i31_0+i0.128,i1.4096] # id=1074, src_id=None, , instances=594 # dl = tensor_op_name: input473_pftranspose_698 | hlo_id: 90 | if -128i31_0-i0.128+75967 >= 0 [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 6.262% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[3] bfloat16 (4, 2, 2, 128, 24, 512) %'input469_local_769'[i15_0_0_775_0_1048,i15_0_0_1,c1_763,i0.128,i2.24,i1.128+128p_2156] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (8, 4, 2, 128, 24, 128) %'input469'[2i15_0_0_775_0_1048+i15_0_0_1,p_2156,c1_763,i0.128,i2.24,i1.128] # id=933, src_id=None, , instances=64 # dl = tensor_op_name: _dot.256 | hlo_id: 59 | [[i0.128];[i1.128, i2.24]] -> [[i0.128];[i1.128, i2.24]] +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 219.783us (48.000MiB, est bw: 229.006GB/s, 5.948% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 24, 128, 4096) %'input470_local_750'[i10_0_0,i10_0_1,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 24, 128, 4096) %'input470'[i10_0_0,i10_0_1,i0.128,i1.4096] # id=924, src_id=None, , instances=48 # dl = tensor_op_name: _dot.254 | hlo_id: 49 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 219.783us (48.000MiB, est bw: 229.006GB/s, 5.948% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 24, 128, 4096) %'input472_local_759'[i12_0_0,i12_0_1,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 24, 128, 4096) %'input472'[i12_0_0,i12_0_1,i0.128,i1.4096] # id=927, src_id=None, , instances=48 # dl = tensor_op_name: _dot.255 | hlo_id: 40 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 191.807us (297.000KiB, est bw: 1.586GB/s, 5.190% of tot. time) for float32<1 x 128> {'no_delinear': '0'}non_local float32 (1, 75968) %'convert.59'[0,128i31_0+i0.128] = store float32<1 x 128> TongaSB partitions[1] float32 (594, 1, 128) %'dot.257.1085'[i31_0,0,i0.128] # id=1083, src_id=None, , instances=594 # dl = tensor_op_name: _dot.257 | hlo_id: 90 | if -128i31_0-i0.128+75967 >= 0 [[];[i0.128]] -> [[];[i0.128]] +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 22.647us (296.758KiB, est bw: 13.418GB/s, 0.613% of tot. time) for float32<1 x 15194> TongaSB partitions[1] float32 (5, 1, 15194) %'custom-call.411.1154'[i1,0,i0.15194] = load float32<1 x 15194> {'no_delinear': '0'}non_local float32 (1, 75968) %'convert.59'[15194i1+i0.15194] # id=1149, src_id=None, , instances=5 # dl = tensor_op_name: _custom-call.411 | hlo_id: 93 | if -15194i1-i0.15194+75967 >= 0 [[];[i0.15194]] -> [[];[i0.15194]] +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 10.403us (2.000MiB, est bw: 201.582GB/s, 0.282% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (2, 128, 4096) %'702.2135'[T_i0_0_2741,i0.128,i1.4096] = load bfloat16<128 x 4096> non_local bfloat16 (2, 128, 4096) %'add.9'[T_i0_0_2741,i0.128,i1.4096] # id=1049, src_id=None, , instances=2 # dl = tensor_op_name: add.9_pftranspose_702 | hlo_id: 27 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 10.403us (2.000MiB, est bw: 201.582GB/s, 0.282% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (2, 128, 4096) %'706.2140'[T_i0_0_2742,i0.128,i1.4096] = load bfloat16<128 x 4096> non_local bfloat16 (1048576,) %'all_reduce.3-buffer-2756'[524288T_i0_0_2742+4096i0.128+i1.4096] # id=1058, src_id=None, , instances=2 # dl = tensor_op_name: all_reduce.3_pftranspose_706 | hlo_id: 62 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 8.606us (2.000MiB, est bw: 243.684GB/s, 0.233% of tot. time) for bfloat16<128 x 1024> non_local bfloat16 (1048576,) %'dot.14-buffer-2754'[1024i15_0_0_775_0_1048+4096i0.128+524288i16_0_775_1048+i1.1024] = store bfloat16<128 x 1024> TongaSB partitions[2] bfloat16 (4, 2, 128, 1024) %776[i15_0_0_775_0_1048,i16_0_775_1048,i0.128,i1.1024] # id=936, src_id=None, , instances=8 # dl = tensor_op_name: _dot.256 | hlo_id: 59 | [[i0.128];[i1.1024]] -> [[i0.128];[i1.1024]] +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 6.737us (2.000MiB, est bw: 311.309GB/s, 0.182% of tot. time) for bfloat16<128 x 4096> non_local bfloat16 (256, 32, 128) %'convert.57'[128T_i20_714_0+i0.128,i2.4+4i3.8,i1.128] = store bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (2, 128, 8, 512) %'710.2550'[T_i20_714_0,i0.128,i3.8,i1.128+128i2.4] # id=1062, src_id=None, , instances=2 # dl = tensor_op_name: convert.57_pftranspose_710 | hlo_id: 70 | [[i0.128];[i1.128, i2.4, i3.8]] -> [[i0.128];[i1.128, i2.4, i3.8]] +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.012 seconds +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.002 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.001 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.003 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (1, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (1, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.001 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.003 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (1, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (1, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/OptimizeNKIKernels]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.450 seconds +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.022 seconds +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-08-07T13:54:00Z WARNING 49109 [sg0002/Tensorizer/StaticProfiler]: matmul-based transposes inserted by penguin takes up 66.92 percent of all matmul computation +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.012 seconds +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-08-07T13:54:00Z INFO 49109 [sg0002/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-08-07T13:54:01Z INFO 49109 [sg0002/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.105 seconds +2025-08-07T13:54:01Z INFO 49109 [sg0002/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-08-07T13:54:01Z INFO 49109 [sg0002/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49109 [sg0002/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.013 seconds +2025-08-07T13:54:01Z INFO 49109 [sg0002/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-08-07T13:54:01Z INFO 49109 [sg0002/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49109 [sg0002/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.104 seconds +2025-08-07T13:54:01Z INFO 49109 [sg0002/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-08-07T13:54:01Z INFO 49109 [sg0002/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49109 [sg0002/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49109 [sg0002/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-08-07T13:54:01Z INFO 49109 [sg0002/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49109 [sg0002/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.463 seconds +2025-08-07T13:54:01Z INFO 49109 [Tensorizer]: BirCodeGen estimate #instances=97138 in sg0002 +2025-08-07T13:54:01Z INFO 49109 [Tensorizer]: IR signature: c138843ed89d72a28292a0c3e01e6e520a0df4f63c8d34d2ea904640b3e0a789 for nc00/sg0002/TensorizerBIR +2025-08-07T13:54:01Z INFO 49109 [Tensorizer]: Weights total number of bytes: 135176 +2025-08-07T13:54:01Z INFO 49109 [Tensorizer]: Successfully built model. +2025-08-07T13:54:01Z USER 47841 [root/Tensorizer/Tensorizer]: Tensorizer finished after 9.854 seconds +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: End tensorization +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input76 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input0 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input79 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input83 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input82 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input1 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input81 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input80 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input78 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input77 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input4 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input2 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input5 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input86 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input87 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input85 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input84 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input90 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input94 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input93 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input92 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input91 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input89 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input88 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input6 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input2 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input7 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input471 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input472 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input470 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input469 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input474 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input1 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input473 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Network input: input3 +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: wrote bir.json +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: wrote tensor_map.json +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: wrote bir.json +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: wrote tensor_map.json +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: wrote bir.json +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: wrote tensor_map.json +2025-08-07T13:54:01Z INFO 47841 [job.Frontend.0]: Job #0 finished +2025-08-07T13:54:01Z INFO 47841 [pipeline.Pipeline.0]: Finished job job.Frontend.0 +2025-08-07T13:54:01Z INFO 47841 [pipeline.Pipeline.0]: Starting job job.StaticIOTranspose.0 +2025-08-07T13:54:01Z INFO 47841 [pipeline.Pipeline.0]: Finished job job.StaticIOTranspose.0 +2025-08-07T13:54:01Z INFO 47841 [pipeline.Pipeline.0]: Starting job job.WalrusDriver.0 +2025-08-07T13:54:01Z INFO 47841 [job.WalrusDriver.0]: BackendDriver has 3 states with 1 core LNC +2025-08-07T13:54:01Z INFO 47841 [job.WalrusDriver.0]: BackendDriver MT cwd: /home/ubuntu/qwen3/context_encoding_model/_tp0_bk1/neuronxcc-k7bxqych +2025-08-07T13:54:01Z INFO 47841 [job.BIRLinker.1]: Creating directory sgLnk/sg00 +2025-08-07T13:54:01Z INFO 47841 [job.WalrusDriver.0]: StateId sg00 Dir /home/ubuntu/qwen3/context_encoding_model/_tp0_bk1/neuronxcc-k7bxqych/sg00 +2025-08-07T13:54:01Z INFO 47841 [job.WalrusDriver.0]: StateId sg01 Dir /home/ubuntu/qwen3/context_encoding_model/_tp0_bk1/neuronxcc-k7bxqych/sg01 +2025-08-07T13:54:01Z INFO 47841 [job.WalrusDriver.0]: StateId sg02 Dir /home/ubuntu/qwen3/context_encoding_model/_tp0_bk1/neuronxcc-k7bxqych/sg02 +2025-08-07T13:54:01Z INFO 47841 [job.WalrusDriver.0]: Number of subgraphs to link: 3 +2025-08-07T13:54:01Z INFO 47841 [job.WalrusDriver.0]: lnkState: {"model": ["/home/ubuntu/qwen3/context_encoding_model/_tp0_bk1/model.MODULE_2914133a46cb7b4660ab+d7af8a84.hlo_module.pb"], "tensormap": "tensor_map.json", "bir": "bir.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "state_dir": "/home/ubuntu/qwen3/context_encoding_model/_tp0_bk1/neuronxcc-k7bxqych/sgLnk/sg00", "state_id": "sgLnk"} +2025-08-07T13:54:01Z INFO 47841 [job.WalrusDriver.0]: BackendDriver in_state.num_states 3 with 1 core LNC +2025-08-07T13:54:01Z INFO 47841 [job.WalrusDriver.0]: Executing /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/walrus_driver --optlevel 2 --allocator coloring --verbose 35 --logfile-verbose 20 --logfile /home/ubuntu/qwen3/context_encoding_model/_tp0_bk1/log-neuron-cc.txt -o walrus_bir.out.json --enable-call-graph --enable-mt-backend --link-subgraphs sg00,sg01,sg02 --link-dir sgLnk/sg00 --execute-repetition 1 -i bir.json --min_split_size 10240 --skip_split_vns '' --no_split_dram --split_huge_dram_tensor 1.0 --preprocessing_only --max_tensorizer_distance 64 --pack_same_shape_only --instruction_fetch_latency 511 --max-partitions 1 --policy 3 --auxflag 0 --interleave none --schedule-delayed-latency 1 --postsched-mm-accum-reorder=false --max-load-lower-bound 0.14 --force-prefetch-follow-incoming-order -1 --allreduce-buffer-size 500 --dram-page-size 512 --dram-rotation-size -1 --allreduce-rotation-dis 8 --repeat-load-thres 4 --enable-mm-transpose-remat-optimization=true --save-len-thres 512 --save-dma-cnt-thres 32 --relaxed-order=true --enable-anti-dependence-reduction=false --num-semaphores-per-queue 16 --numcores 1 --act-root-json /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/pwp/pwp_bin_trainium/act_info.json --dve-root-json /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen2/dve_info.json --enable-verifier=true --enable-birsim=false --enable-birsim-sync-only=false --enable-data-race-checker=false --enable-new-backend=true --inject-error=NONE --enable-internal-partitioner --dge-levels scalar_dynamic_offset,vector_dynamic_offsets,io --dynamic-dma-scratch-size-per-partition=16384 --neff-output-filename /home/ubuntu/qwen3/context_encoding_model/_tp0_bk1/model.MODULE_2914133a46cb7b4660ab+d7af8a84.neff +2025-08-07T13:54:01Z INFO 47841 [job.WalrusDriver.0]: Working directory is /home/ubuntu/qwen3/context_encoding_model/_tp0_bk1/neuronxcc-k7bxqych +2025-08-07T13:54:01Z INFO 47841 [job.WalrusDriver.0]: propagate_exit=True +2025-08-07T13:54:01Z INFO 47841 [job.WalrusDriver.0]: use_logger=False +2025-08-07T13:54:01Z INFO 47841 [job.WalrusDriver.0]: expose_stderr=True +2025-08-07T13:54:01Z INFO 49187 [Logging]: Logging to ../log-neuron-cc.txt at level 'INFO' +2025-08-07T13:54:01Z INFO 49187 [BackendDriver]: max_allowed_parallelism=128 +2025-08-07T13:54:01Z INFO 49187 [BackendDriver]: Loading module from sg00/bir.json +2025-08-07T13:54:01Z INFO 49187 [BackendDriver]: Loading module from sg01/bir.json +2025-08-07T13:54:01Z INFO 49187 [BackendDriver]: Loading module from sg02/bir.json +2025-08-07T13:54:02Z INFO 49187 [BackendDriver]: Backend driver mtBackend: true numModules: 3 Cwd: "/home/ubuntu/qwen3/context_encoding_model/_tp0_bk1/neuronxcc-k7bxqych" +2025-08-07T13:54:02Z INFO 49187 [BackendDriver]: DynamicDMA is enabled +2025-08-07T13:54:02Z INFO 49187 [BackendDriver]: DynamicDMA levels being enabled: io, scalar_dynamic_offset, vector_dynamic_offsets, +2025-08-07T13:54:02Z INFO 49187 [BackendDriver]: Modular flow call graph is enabled +2025-08-07T13:54:02Z INFO 49187 [BackendDriver]: Internal partitioner is enabled +2025-08-07T13:54:02Z USER 49187 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:54:02Z INFO 49187 [BackendPassManager]: Inputs to mod_parallel_pass: modules=3 functions=3 allocs=670 blocks=3 instructions=1105 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running do_nothing +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=196 blocks=1 instructions=77 Max writers: 3 Max Readers: 10 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 67mb, ru_maxrss: 197mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 196 memory location(s), 1 block(s), and 77 instruction(s). Max writers: 3 Max Readers: 10 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=196 blocks=1 instructions=77 Max writers: 3 Max Readers: 10 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: Running do_nothing +2025-08-07T13:54:02Z USER 49187 (sg02) [ModuleForkPass]: Running do_nothing +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=136 blocks=1 instructions=65 Max writers: 2 Max Readers: 8 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 67mb, ru_maxrss: 197mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 136 memory location(s), 1 block(s), and 65 instruction(s). Max writers: 2 Max Readers: 8 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=136 blocks=1 instructions=65 Max writers: 2 Max Readers: 8 +2025-08-07T13:54:02Z WARNING 49187 [birverifier::InstVisitor]: (sg00) Non - output memory location with no reader: {convert.270.1811}@SB<0,0>(1x2)#Internal DebugInfo: +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=338 blocks=1 instructions=963 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:02Z USER 49187 (sg02) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 68mb, ru_maxrss: 197mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 338 memory location(s), 1 block(s), and 963 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:54:02Z USER 49187 (sg02) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=338 blocks=1 instructions=963 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: birverifier finished after 0.004 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 72mb, ru_maxrss: 197mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 196 memory location(s), 1 block(s), and 77 instruction(s). Max writers: 3 Max Readers: 10 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: birverifier finished after 0.013 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 96mb, ru_maxrss: 197mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 136 memory location(s), 1 block(s), and 65 instruction(s). Max writers: 2 Max Readers: 8 +2025-08-07T13:54:02Z USER 49187 (sg02) [ModuleForkPass]: birverifier finished after 0.133 seconds +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 247mb, ru_maxrss: 247mb (delta=50mb) +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 338 memory location(s), 1 block(s), and 963 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:54:02Z USER 49187 [ModuleForkPass]: Compilation status: Total modules: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:02Z USER 49187 [BackendPassManager]: mod_parallel_pass finished after 0.135 seconds +2025-08-07T13:54:02Z INFO 49187 [BackendPassManager]: curr_vmrss: 240mb, ru_maxrss: 247mb (delta=50mb) +2025-08-07T13:54:02Z INFO 49187 [BackendPassManager]: Output has 3 module(s), 3 function(s), 670 memory location(s), 3 block(s), and 1105 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:54:02Z USER 49187 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:54:02Z INFO 49187 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=3 functions=3 allocs=670 blocks=3 instructions=1105 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:02Z USER 49187 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:54:02Z INFO 49187 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=196 blocks=1 instructions=77 Max writers: 3 Max Readers: 10 +2025-08-07T13:54:02Z USER 49187 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [SubgraphForkPass]: curr_vmrss: 240mb, ru_maxrss: 247mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 196 memory location(s), 1 block(s), and 77 instruction(s). Max writers: 3 Max Readers: 10 +2025-08-07T13:54:02Z USER 49187 (sg02) [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:54:02Z USER 49187 (sg01) [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:54:02Z INFO 49187 (sg01) [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=136 blocks=1 instructions=65 Max writers: 2 Max Readers: 8 +2025-08-07T13:54:02Z USER 49187 (sg01) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [SubgraphForkPass]: curr_vmrss: 240mb, ru_maxrss: 247mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 136 memory location(s), 1 block(s), and 65 instruction(s). Max writers: 2 Max Readers: 8 +2025-08-07T13:54:02Z INFO 49187 (sg02) [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=338 blocks=1 instructions=963 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:02Z USER 49187 (sg02) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg02) [SubgraphForkPass]: curr_vmrss: 240mb, ru_maxrss: 247mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg02) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 338 memory location(s), 1 block(s), and 963 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:54:02Z USER 49187 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:02Z USER 49187 [BackendPassManager]: subgraph_parallel_pass finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49187 [BackendPassManager]: curr_vmrss: 240mb, ru_maxrss: 247mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 [BackendPassManager]: Output has 3 module(s), 3 function(s), 670 memory location(s), 3 block(s), and 1105 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:54:02Z USER 49187 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:54:02Z INFO 49187 [BackendPassManager]: Inputs to mod_parallel_pass: modules=3 functions=3 allocs=670 blocks=3 instructions=1105 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running expand_replication +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=196 blocks=1 instructions=77 Max writers: 3 Max Readers: 10 +2025-08-07T13:54:02Z INFO 49187 (sg00) [ExpandReplication]: Found 0 replicated matmults +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 240mb, ru_maxrss: 247mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 196 memory location(s), 1 block(s), and 77 instruction(s). Max writers: 3 Max Readers: 10 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running unroll +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=196 blocks=1 instructions=77 Max writers: 3 Max Readers: 10 +2025-08-07T13:54:02Z INFO 49187 (sg00) [Unroll]: INFO (Unroll) Start unrolling at Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z USER 49187 (sg02) [ModuleForkPass]: Running expand_replication +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: Running expand_replication +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=136 blocks=1 instructions=65 Max writers: 2 Max Readers: 8 +2025-08-07T13:54:02Z INFO 49187 (sg01) [ExpandReplication]: Found 0 replicated matmults +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=338 blocks=1 instructions=963 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 240mb, ru_maxrss: 247mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 136 memory location(s), 1 block(s), and 65 instruction(s). Max writers: 2 Max Readers: 8 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: Running unroll +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=136 blocks=1 instructions=65 Max writers: 2 Max Readers: 8 +2025-08-07T13:54:02Z INFO 49187 (sg01) [Unroll]: INFO (Unroll) Start unrolling at Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg02) [ExpandReplication]: Found 0 replicated matmults +2025-08-07T13:54:02Z USER 49187 (sg02) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 240mb, ru_maxrss: 247mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 338 memory location(s), 1 block(s), and 963 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:54:02Z USER 49187 (sg02) [ModuleForkPass]: Running unroll +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=338 blocks=1 instructions=963 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:02Z INFO 49187 (sg02) [Unroll]: INFO (Unroll) Start unrolling at Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg00) [Unroll]: INFO (Unroll) DONE unrolling Thu Aug 7 13:54:02 2025 + +2025-08-07T13:54:02Z INFO 49187 (sg00) [Unroll]: sg0000 Instruction count after Unroll: +2025-08-07T13:54:02Z INFO 49187 (sg00) [Unroll]: Total count: 2144 +2025-08-07T13:54:02Z INFO 49187 (sg00) [Unroll]: Matmult: 1216 +2025-08-07T13:54:02Z INFO 49187 (sg00) [Unroll]: GenericCopy: 222 +2025-08-07T13:54:02Z INFO 49187 (sg00) [Unroll]: TensorTensor: 216 +2025-08-07T13:54:02Z INFO 49187 (sg00) [Unroll]: TensorScalarPtr: 184 +2025-08-07T13:54:02Z INFO 49187 (sg00) [Unroll]: Activation: 91 +2025-08-07T13:54:02Z INFO 49187 (sg00) [Unroll]: Load: 89 +2025-08-07T13:54:02Z INFO 49187 (sg00) [Unroll]: Save: 36 +2025-08-07T13:54:02Z INFO 49187 (sg00) [Unroll]: TensorReduce: 32 +2025-08-07T13:54:02Z INFO 49187 (sg00) [Unroll]: DMACopy: 19 +2025-08-07T13:54:02Z INFO 49187 (sg00) [Unroll]: Memset: 17 +2025-08-07T13:54:02Z INFO 49187 (sg00) [Unroll]: TensorScalarAffineSelect: 16 +2025-08-07T13:54:02Z INFO 49187 (sg00) [Unroll]: Reciprocal: 2 +2025-08-07T13:54:02Z INFO 49187 (sg00) [Unroll]: CollectiveCompute: 2 +2025-08-07T13:54:02Z INFO 49187 (sg00) [Unroll]: Iota: 2 +2025-08-07T13:54:02Z INFO 49187 (sg00) [Unroll]: Unrolled DGE count with Dynamic AP: 18 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: unroll finished after 0.025 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 282mb, ru_maxrss: 282mb (delta=35mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 988 memory location(s), 1 block(s), and 2144 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 (sg01) [Unroll]: INFO (Unroll) DONE unrolling Thu Aug 7 13:54:02 2025 + +2025-08-07T13:54:02Z INFO 49187 (sg01) [Unroll]: sg0001 Instruction count after Unroll: +2025-08-07T13:54:02Z INFO 49187 (sg01) [Unroll]: Total count: 6199 +2025-08-07T13:54:02Z INFO 49187 (sg01) [Unroll]: Matmult: 5198 +2025-08-07T13:54:02Z INFO 49187 (sg01) [Unroll]: Load: 215 +2025-08-07T13:54:02Z INFO 49187 (sg01) [Unroll]: GenericCopy: 190 +2025-08-07T13:54:02Z INFO 49187 (sg01) [Unroll]: TensorTensor: 184 +2025-08-07T13:54:02Z INFO 49187 (sg01) [Unroll]: Activation: 166 +2025-08-07T13:54:02Z INFO 49187 (sg01) [Unroll]: TensorScalarPtr: 147 +2025-08-07T13:54:02Z INFO 49187 (sg01) [Unroll]: TensorReduce: 32 +2025-08-07T13:54:02Z INFO 49187 (sg01) [Unroll]: Save: 19 +2025-08-07T13:54:02Z INFO 49187 (sg01) [Unroll]: DMACopy: 18 +2025-08-07T13:54:02Z INFO 49187 (sg01) [Unroll]: Select: 16 +2025-08-07T13:54:02Z INFO 49187 (sg01) [Unroll]: Memset: 10 +2025-08-07T13:54:02Z INFO 49187 (sg01) [Unroll]: Reciprocal: 2 +2025-08-07T13:54:02Z INFO 49187 (sg01) [Unroll]: CollectiveCompute: 2 +2025-08-07T13:54:02Z INFO 49187 (sg01) [Unroll]: Unrolled DGE count with Dynamic AP: 16 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: unroll finished after 0.075 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 321mb (delta=74mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1156 memory location(s), 1 block(s), and 6199 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z INFO 49187 (sg02) [Unroll]: INFO (Unroll) DONE unrolling Thu Aug 7 13:54:02 2025 + +2025-08-07T13:54:02Z INFO 49187 (sg02) [Unroll]: sg0002 Instruction count after Unroll: +2025-08-07T13:54:02Z INFO 49187 (sg02) [Unroll]: Total count: 51393 +2025-08-07T13:54:02Z INFO 49187 (sg02) [Unroll]: Matmult: 42707 +2025-08-07T13:54:02Z INFO 49187 (sg02) [Unroll]: GenericCopy: 6058 +2025-08-07T13:54:02Z INFO 49187 (sg02) [Unroll]: Load: 788 +2025-08-07T13:54:02Z INFO 49187 (sg02) [Unroll]: Save: 619 +2025-08-07T13:54:02Z INFO 49187 (sg02) [Unroll]: Max: 224 +2025-08-07T13:54:02Z INFO 49187 (sg02) [Unroll]: MaxIndex: 224 +2025-08-07T13:54:02Z INFO 49187 (sg02) [Unroll]: MatchReplace: 217 +2025-08-07T13:54:02Z INFO 49187 (sg02) [Unroll]: TensorScalarPtr: 214 +2025-08-07T13:54:02Z INFO 49187 (sg02) [Unroll]: TensorTensor: 153 +2025-08-07T13:54:02Z INFO 49187 (sg02) [Unroll]: Activation: 117 +2025-08-07T13:54:02Z INFO 49187 (sg02) [Unroll]: Gather: 35 +2025-08-07T13:54:02Z INFO 49187 (sg02) [Unroll]: Memset: 12 +2025-08-07T13:54:02Z INFO 49187 (sg02) [Unroll]: TensorReduce: 8 +2025-08-07T13:54:02Z INFO 49187 (sg02) [Unroll]: StreamShuffle: 4 +2025-08-07T13:54:02Z INFO 49187 (sg02) [Unroll]: CollectiveCompute: 3 +2025-08-07T13:54:02Z INFO 49187 (sg02) [Unroll]: Select: 3 +2025-08-07T13:54:02Z INFO 49187 (sg02) [Unroll]: Reciprocal: 3 +2025-08-07T13:54:02Z INFO 49187 (sg02) [Unroll]: Iota: 2 +2025-08-07T13:54:02Z INFO 49187 (sg02) [Unroll]: DMACopy: 2 +2025-08-07T13:54:02Z INFO 49187 (sg02) [Unroll]: Unrolled DGE count with Dynamic AP: 1 +2025-08-07T13:54:02Z USER 49187 (sg02) [ModuleForkPass]: unroll finished after 0.505 seconds +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 538mb, ru_maxrss: 538mb (delta=291mb) +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9923 memory location(s), 1 block(s), and 51393 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:02Z USER 49187 [ModuleForkPass]: Compilation status: Total modules: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:02Z USER 49187 [BackendPassManager]: mod_parallel_pass finished after 0.515 seconds +2025-08-07T13:54:02Z INFO 49187 [BackendPassManager]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=291mb) +2025-08-07T13:54:02Z INFO 49187 [BackendPassManager]: Output has 3 module(s), 3 function(s), 12067 memory location(s), 3 block(s), and 59736 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:02Z USER 49187 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:54:02Z INFO 49187 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=3 functions=3 allocs=12067 blocks=3 instructions=59736 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:02Z USER 49187 (sg00) [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:54:02Z INFO 49187 (sg00) [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=988 blocks=1 instructions=2144 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg01) [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:54:02Z USER 49187 (sg02) [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:54:02Z INFO 49187 (sg01) [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=1156 blocks=1 instructions=6199 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z INFO 49187 (sg02) [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=9923 blocks=1 instructions=51393 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:02Z INFO 49187 (sg01) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49187 (sg01) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49187 (sg01) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg01) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z USER 49187 (sg01) [SubgraphForkPass]: dead_code_elim finished after 0.006 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [SubgraphForkPass]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 1087 memory location(s), 1 block(s), and 6199 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z INFO 49187 (sg00) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg00) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg00) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z USER 49187 (sg00) [SubgraphForkPass]: dead_code_elim finished after 0.007 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [SubgraphForkPass]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 876 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 (sg02) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49187 (sg02) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg02) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg02) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z USER 49187 (sg02) [SubgraphForkPass]: dead_code_elim finished after 0.054 seconds +2025-08-07T13:54:02Z INFO 49187 (sg02) [SubgraphForkPass]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg02) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 9890 memory location(s), 1 block(s), and 51393 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:02Z USER 49187 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:02Z USER 49187 [BackendPassManager]: subgraph_parallel_pass finished after 0.056 seconds +2025-08-07T13:54:02Z INFO 49187 [BackendPassManager]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 [BackendPassManager]: Output has 3 module(s), 3 function(s), 11853 memory location(s), 3 block(s), and 59735 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:02Z USER 49187 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:54:02Z INFO 49187 [BackendPassManager]: Inputs to mod_parallel_pass: modules=3 functions=3 allocs=11853 blocks=3 instructions=59735 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=876 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:02Z USER 49187 (sg02) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=1087 blocks=1 instructions=6199 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=9890 blocks=1 instructions=51393 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: birverifier finished after 0.003 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 876 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: birverifier finished after 0.007 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1087 memory location(s), 1 block(s), and 6199 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z USER 49187 (sg02) [ModuleForkPass]: birverifier finished after 0.081 seconds +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9890 memory location(s), 1 block(s), and 51393 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:02Z USER 49187 [ModuleForkPass]: Compilation status: Total modules: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:02Z USER 49187 [BackendPassManager]: mod_parallel_pass finished after 0.083 seconds +2025-08-07T13:54:02Z INFO 49187 [BackendPassManager]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 [BackendPassManager]: Output has 3 module(s), 3 function(s), 11853 memory location(s), 3 block(s), and 59735 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:02Z USER 49187 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:54:02Z INFO 49187 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=3 functions=3 allocs=11853 blocks=3 instructions=59735 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:02Z USER 49187 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:54:02Z INFO 49187 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=876 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [SubgraphForkPass]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 876 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg02) [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:54:02Z USER 49187 (sg01) [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:54:02Z INFO 49187 (sg01) [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=1087 blocks=1 instructions=6199 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z USER 49187 (sg01) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [SubgraphForkPass]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 1087 memory location(s), 1 block(s), and 6199 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z INFO 49187 (sg02) [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=9890 blocks=1 instructions=51393 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:02Z USER 49187 (sg02) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg02) [SubgraphForkPass]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg02) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 9890 memory location(s), 1 block(s), and 51393 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:02Z USER 49187 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:02Z USER 49187 [BackendPassManager]: subgraph_parallel_pass finished after 0.002 seconds +2025-08-07T13:54:02Z INFO 49187 [BackendPassManager]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 [BackendPassManager]: Output has 3 module(s), 3 function(s), 11853 memory location(s), 3 block(s), and 59735 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:02Z USER 49187 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:54:02Z INFO 49187 [BackendPassManager]: Inputs to mod_parallel_pass: modules=3 functions=3 allocs=11853 blocks=3 instructions=59735 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running instruction_reorder +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=876 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: instruction_reorder finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: Running instruction_reorder +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 876 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg02) [ModuleForkPass]: Running instruction_reorder +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running psum_legalization +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=1087 blocks=1 instructions=6199 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=876 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: psum_legalization finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 876 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running legalize_cce_dma +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=876 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: legalize_cce_dma finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=9890 blocks=1 instructions=51393 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 876 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running pre_opts +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=876 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 (sg00) [PreOpts]: Skipped. No pre-opt passes enabled +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 876 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running error_injector +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=876 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z WARNING 49187 (sg00) [ErrorInjector]: Unrecognized injected error value "0" +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 876 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running vn_splitter +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=876 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 (sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 0 +2025-08-07T13:54:02Z INFO 49187 (sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: instruction_reorder finished after 0.002 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1087 memory location(s), 1 block(s), and 6199 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: Running psum_legalization +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=1087 blocks=1 instructions=6199 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z INFO 49187 (sg00) [ShrinkDN]: INFO (ShrinkDN): Shrunk 1 nodes. Total savings 126 bytes/partition +2025-08-07T13:54:02Z INFO 49187 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-08-07T13:54:02Z INFO 49187 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-08-07T13:54:02Z INFO 49187 (sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0 seconds +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: vn_splitter finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 876 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running constant_propagate +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=876 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: psum_legalization finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1087 memory location(s), 1 block(s), and 6199 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: Running legalize_cce_dma +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=1087 blocks=1 instructions=6199 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z INFO 49187 (sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: legalize_cce_dma finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1087 memory location(s), 1 block(s), and 6199 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: Running pre_opts +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=1087 blocks=1 instructions=6199 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z INFO 49187 (sg01) [PreOpts]: Skipped. No pre-opt passes enabled +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1087 memory location(s), 1 block(s), and 6199 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: Running error_injector +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=1087 blocks=1 instructions=6199 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z WARNING 49187 (sg01) [ErrorInjector]: Unrecognized injected error value "0" +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1087 memory location(s), 1 block(s), and 6199 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: Running vn_splitter +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=1087 blocks=1 instructions=6199 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z INFO 49187 (sg01) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 1 +2025-08-07T13:54:02Z INFO 49187 (sg01) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-08-07T13:54:02Z INFO 49187 (sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49187 (sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-08-07T13:54:02Z INFO 49187 (sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49187 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-08-07T13:54:02Z INFO 49187 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-08-07T13:54:02Z INFO 49187 (sg01) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.001 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.001 seconds +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: vn_splitter finished after 0.004 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1087 memory location(s), 1 block(s), and 6199 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: Running constant_propagate +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=1087 blocks=1 instructions=6199 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z INFO 49187 (sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: constant_propagate finished after 0.005 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 876 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running lower_ac +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=876 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 (sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: lower_ac finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 876 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running input_dma_coalescing +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=876 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: input_dma_coalescing finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 876 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running remat_optimization +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=876 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 (sg00) [RematOpt]: Removed 0 remat instructions +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: remat_optimization finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 876 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running early_peephole_opts +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to early_peephole_opts: modules=1 functions=1 allocs=876 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 (sg00) [EarlyPeepholeOpts]: PeepholeOpts enabled? ActivationAccumulate: true +2025-08-07T13:54:02Z INFO 49187 (sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49187 (sg00) [EarlyPeepholeOpts]: Activation Accumulate: 0 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: early_peephole_opts finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 876 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=876 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg02) [ModuleForkPass]: instruction_reorder finished after 0.009 seconds +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 876 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running infer_stream_ids +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=876 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: infer_stream_ids finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 876 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running pre_sched +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=876 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 (sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 [LayerSpiller]: LayerSpill: Start... +2025-08-07T13:54:02Z INFO 49187 [LayerSpiller]: LayerSpill: Found 1 Splits CCs +2025-08-07T13:54:02Z INFO 49187 [LayerSpiller]: Grouped CCs to 1 clusters. +2025-08-07T13:54:02Z INFO 49187 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-08-07T13:54:02Z INFO 49187 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-08-07T13:54:02Z INFO 49187 [LayerSpiller]: LayerSpill: Done. +2025-08-07T13:54:02Z INFO 49187 (sg00) [PreSched]: Start split live ranges Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9890 memory location(s), 1 block(s), and 51393 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:02Z USER 49187 (sg02) [ModuleForkPass]: Running psum_legalization +2025-08-07T13:54:02Z INFO 49187 (sg00) [PreSched]: No split opportunities: +2025-08-07T13:54:02Z INFO 49187 (sg00) [PreSched]: End split live ranges Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg00) [PreSched]: Strt remove redundncies Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg00) [PreSched]: remove_redundant_memsets +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=9890 blocks=1 instructions=51393 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:02Z INFO 49187 (sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg00) [PreSched]: remove_redundant_memsets: 0 +2025-08-07T13:54:02Z INFO 49187 (sg00) [PreSched]: remove_redundant_loads +2025-08-07T13:54:02Z INFO 49187 (sg00) [PreSched]: remove_redundant_loads: 0 +2025-08-07T13:54:02Z INFO 49187 (sg00) [PreSched]: End remove redundncies Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg01) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-08-07T13:54:02Z INFO 49187 (sg00) [PreSched]: Start DCE Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49187 (sg00) [PreSched]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49187 (sg00) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg00) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg00) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg00) [PreSched]: End DCE Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg00) [PreSched]: Start build flow dependencies Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg00) [build_flow_deps]: Start build fdeps. Invocation: 1Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg00) [build_flow_deps]: Allocs: 876 instructions: 2143 +2025-08-07T13:54:02Z INFO 49187 (sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: constant_propagate finished after 0.008 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 382mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z USER 49187 (sg02) [ModuleForkPass]: psum_legalization finished after 0.005 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1087 memory location(s), 1 block(s), and 6199 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: Running lower_ac +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 382mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=1087 blocks=1 instructions=6199 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z INFO 49187 (sg01) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: lower_ac finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 382mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1087 memory location(s), 1 block(s), and 6199 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: Running input_dma_coalescing +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=1087 blocks=1 instructions=6199 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9890 memory location(s), 1 block(s), and 51393 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:02Z USER 49187 (sg02) [ModuleForkPass]: Running legalize_cce_dma +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=9890 blocks=1 instructions=51393 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: input_dma_coalescing finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 382mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1087 memory location(s), 1 block(s), and 6199 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: Running remat_optimization +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=1087 blocks=1 instructions=6199 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z INFO 49187 (sg01) [RematOpt]: Removed 0 remat instructions +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: remat_optimization finished after 0.003 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 382mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1087 memory location(s), 1 block(s), and 6199 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: Running early_peephole_opts +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to early_peephole_opts: modules=1 functions=1 allocs=1087 blocks=1 instructions=6199 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z INFO 49187 (sg01) [EarlyPeepholeOpts]: PeepholeOpts enabled? ActivationAccumulate: true +2025-08-07T13:54:02Z INFO 49187 (sg00) [build_flow_deps]: Build fdeps inserted 5443 edges +2025-08-07T13:54:02Z INFO 49187 (sg00) [build_flow_deps]: Done build fdeps 5443 Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg00) [PreSched]: End build flow dependencies Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg00) [PreSched]: Start remove useless insts Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg00) [PreSched]: remove_useless_insts +2025-08-07T13:54:02Z INFO 49187 (sg00) [PreSched]: remove Useless Instructions: 0 +2025-08-07T13:54:02Z INFO 49187 (sg00) [PreSched]: End remove useless insts Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg00) [PreSched]: Start scratchpad optimization Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg01) [EarlyPeepholeOpts]: Activation Accumulate: 0 +2025-08-07T13:54:02Z INFO 49187 (sg00) [PreSched]: End scratchpad optimization Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: early_peephole_opts finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 382mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1087 memory location(s), 1 block(s), and 6199 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=1087 blocks=1 instructions=6199 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 382mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1087 memory location(s), 1 block(s), and 6199 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: Running infer_stream_ids +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=1087 blocks=1 instructions=6199 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: infer_stream_ids finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 382mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1087 memory location(s), 1 block(s), and 6199 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: Running pre_sched +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=1087 blocks=1 instructions=6199 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z INFO 49187 (sg01) [PreSched]: Start PRE scheduling 2 cores: 1 at: Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 [LayerSpiller]: LayerSpill: Start... +2025-08-07T13:54:02Z USER 49187 (sg02) [ModuleForkPass]: legalize_cce_dma finished after 0.005 seconds +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 382mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 [LayerSpiller]: LayerSpill: Found 2 Splits CCs +2025-08-07T13:54:02Z INFO 49187 [LayerSpiller]: Grouped CCs to 2 clusters. +2025-08-07T13:54:02Z INFO 49187 (sg00) [PreSched]: DONE PRE scheduling Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9890 memory location(s), 1 block(s), and 51393 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:02Z USER 49187 (sg02) [ModuleForkPass]: Running pre_opts +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: pre_sched finished after 0.012 seconds +2025-08-07T13:54:02Z INFO 49187 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-08-07T13:54:02Z INFO 49187 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-08-07T13:54:02Z INFO 49187 [LayerSpiller]: LayerSpill: Done. +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 382mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [PreSched]: Start split live ranges Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 876 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=876 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=9890 blocks=1 instructions=51393 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:02Z INFO 49187 (sg02) [PreOpts]: Skipped. No pre-opt passes enabled +2025-08-07T13:54:02Z USER 49187 (sg02) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 382mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [PreSched]: No split opportunities: +2025-08-07T13:54:02Z INFO 49187 (sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-08-07T13:54:02Z INFO 49187 (sg01) [PreSched]: End split live ranges Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg01) [PreSched]: Strt remove redundncies Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg01) [PreSched]: remove_redundant_memsets +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9890 memory location(s), 1 block(s), and 51393 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:02Z USER 49187 (sg02) [ModuleForkPass]: Running error_injector +2025-08-07T13:54:02Z INFO 49187 (sg01) [PreSched]: remove_redundant_memsets: 0 +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=9890 blocks=1 instructions=51393 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:02Z WARNING 49187 (sg02) [ErrorInjector]: Unrecognized injected error value "0" +2025-08-07T13:54:02Z USER 49187 (sg02) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 382mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [PreSched]: remove_redundant_loads +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9890 memory location(s), 1 block(s), and 51393 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:02Z USER 49187 (sg02) [ModuleForkPass]: Running vn_splitter +2025-08-07T13:54:02Z INFO 49187 (sg01) [PreSched]: remove_redundant_loads: 0 +2025-08-07T13:54:02Z INFO 49187 (sg01) [PreSched]: End remove redundncies Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg01) [PreSched]: Start DCE Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=9890 blocks=1 instructions=51393 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:02Z INFO 49187 (sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49187 (sg00) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg00) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg02) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 14 +2025-08-07T13:54:02Z INFO 49187 (sg02) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-08-07T13:54:02Z INFO 49187 (sg00) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.003 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 382mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 876 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running dynamic_dma_setup +2025-08-07T13:54:02Z INFO 49187 (sg01) [PreSched]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=876 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 382mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 877 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running runtime_memory_reservation +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=877 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 382mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 877 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running coloring_allocator_psum +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=877 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 (sg00) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:02Z INFO 49187 (sg00) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:02Z INFO 49187 (sg00) [PSUM_Allocator]: allocating PSUM +2025-08-07T13:54:02Z INFO 49187 (sg00) [PSUM_Allocator]: main loop +2025-08-07T13:54:02Z INFO 49187 (sg00) [PSUM_Allocator]: renumber locations +2025-08-07T13:54:02Z INFO 49187 (sg00) [PSUM_Allocator]: size = 226 +2025-08-07T13:54:02Z INFO 49187 (sg00) [PSUM_Allocator]: build_no_bitmap start +2025-08-07T13:54:02Z INFO 49187 (sg00) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-08-07T13:54:02Z INFO 49187 (sg01) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg00) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-08-07T13:54:02Z INFO 49187 (sg00) [PSUM_Allocator]: found 380 edges +2025-08-07T13:54:02Z INFO 49187 (sg00) [PSUM_Allocator]: mean: 3.36283 +2025-08-07T13:54:02Z INFO 49187 (sg00) [PSUM_Allocator]: median: 3.04437 +2025-08-07T13:54:02Z INFO 49187 (sg00) [PSUM_Allocator]: adjacency vectors require 3040 bytes +2025-08-07T13:54:02Z INFO 49187 (sg00) [PSUM_Allocator]: build_no_bitmap done +2025-08-07T13:54:02Z INFO 49187 (sg00) [PSUM_Allocator]: find costs +2025-08-07T13:54:02Z INFO 49187 (sg01) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg01) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg01) [PreSched]: End DCE Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-08-07T13:54:02Z INFO 49187 (sg00) [PSUM_Allocator]: simplify interference graph +2025-08-07T13:54:02Z INFO 49187 (sg00) [PSUM_Allocator]: initialize low and high +2025-08-07T13:54:02Z INFO 49187 (sg00) [PSUM_Allocator]: lo = 226 +2025-08-07T13:54:02Z INFO 49187 (sg00) [PSUM_Allocator]: hi = 0 +2025-08-07T13:54:02Z INFO 49187 (sg00) [PSUM_Allocator]: inf = 0 +2025-08-07T13:54:02Z INFO 49187 (sg00) [PSUM_Allocator]: total = 226 +2025-08-07T13:54:02Z INFO 49187 (sg00) [PSUM_Allocator]: simplify +2025-08-07T13:54:02Z INFO 49187 (sg00) [PSUM_Allocator]: new candidates = 0 +2025-08-07T13:54:02Z INFO 49187 (sg00) [PSUM_Allocator]: select ranges +2025-08-07T13:54:02Z INFO 49187 (sg00) [PSUM_Allocator]: no more spills +2025-08-07T13:54:02Z INFO 49187 (sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-08-07T13:54:02Z INFO 49187 (sg00) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-08-07T13:54:02Z INFO 49187 (sg00) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: coloring_allocator_psum finished after 0.005 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 383mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 877 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running dma_optimization_psum +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=877 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 (sg01) [PreSched]: Start build flow dependencies Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg01) [build_flow_deps]: Start build fdeps. Invocation: 2Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg01) [build_flow_deps]: Allocs: 1087 instructions: 6199 +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: dma_optimization_psum finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 383mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 877 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running address_rotation_psum +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=877 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 38 PSUM Banks +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 8 PSUM Banks +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 37 PSUM Banks +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: address_rotation_psum finished after 0.003 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 383mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 877 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running coloring_allocator_sb +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=877 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 (sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 46189568 +2025-08-07T13:54:02Z INFO 49187 (sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 4508 bytes +2025-08-07T13:54:02Z INFO 49187 (sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 5439490 +2025-08-07T13:54:02Z INFO 49187 (sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 1464 bytes +2025-08-07T13:54:02Z INFO 49187 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 1582080 +2025-08-07T13:54:02Z INFO 49187 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 343 bytes +2025-08-07T13:54:02Z INFO 49187 (sg00) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:02Z INFO 49187 (sg00) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: allocating SB +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: main loop +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: renumber locations +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: size = 620 +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: find partners +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: found 119 accumulation groups +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: largest = _dot-t1291_i0 +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: tensors = 33 +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: requires 49152 bytes/partition +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: expanding partners +2025-08-07T13:54:02Z INFO 49187 []: find first defs for local +2025-08-07T13:54:02Z INFO 49187 []: find first defs for global +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: find loads +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: 1 pin count +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: 65 remat count +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: build interference graph +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: pass 1 int-tree +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: Num intervals 620 Num locations 620 +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: IntervalTree Build Done +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: info.neighbors init Done +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: info.neighbors partners Done +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: IntervalTree readback Done +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: edge: 21704 +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: mean: 70.0129 +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: median: 58.0889 +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: find costs +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: simplify interference graph +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: initialize safe & unsafe +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: safe = 438 +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: unsafe = 165 +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: inf = 16 +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: total = 619 +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: simplify +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 141 #Pinned 0 #Safe 0 minCost 0.00397913 maxCost 0.0961667 locations 620 +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: new candidates = 14 +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: select ranges +2025-08-07T13:54:02Z INFO 49187 (sg01) [build_flow_deps]: Build fdeps inserted 19695 edges +2025-08-07T13:54:02Z INFO 49187 (sg01) [build_flow_deps]: Done build fdeps 19695 Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg01) [PreSched]: End build flow dependencies Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg01) [PreSched]: Start remove useless insts Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg01) [PreSched]: remove_useless_insts +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: Total: 619 +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: Spilled: 0.000 (0) +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: Allocated: 1.000 (619) +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: Rover zone: 0.698 (432) +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: Pre-rover zone: 0.036 (22) +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: Post-rover zone: 0.267 (165) +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: Slice zone: 0.000 (0) +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: Blocks nothing: 0.013 (8) +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: Blocks medium: 0.006 (4) +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: Visited until medium blocking (mean): 0.750 +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: Visited until medium blocking (median): 0.812 +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: Visited until medium blocking (p95): 0.812 +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: Blocks tall: 0.981 (607) +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.972 +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: Success +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: SB spills = 0 tensors +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: remats = 0 tensors +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: unpinned = 0 tensors +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: SB score = 0 +2025-08-07T13:54:02Z INFO 49187 (sg01) [PreSched]: remove Useless Instructions: 0 +2025-08-07T13:54:02Z INFO 49187 (sg01) [PreSched]: End remove useless insts Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg01) [PreSched]: Start scratchpad optimization Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: 16384 bytes/partition (100%) successfully pinned +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: pinning saved approximately 9010 cycles +2025-08-07T13:54:02Z INFO 49187 (sg00) [SB_Allocator]: 0% SB utilization after allocation +2025-08-07T13:54:02Z INFO 49187 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 46189568 +2025-08-07T13:54:02Z INFO 49187 (sg01) [PreSched]: End scratchpad optimization Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 4508 bytes +2025-08-07T13:54:02Z INFO 49187 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 5439490 +2025-08-07T13:54:02Z INFO 49187 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 1464 bytes +2025-08-07T13:54:02Z INFO 49187 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 1582080 +2025-08-07T13:54:02Z INFO 49187 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 343 bytes +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: coloring_allocator_sb finished after 0.010 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 384mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 877 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=877 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: address_rotation_sb finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 384mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 877 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running dma_optimization_sb +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=877 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 51629058, 81.3404% input load, 4.44277% output write, 14.2169% spill/reload [sg0000] +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: removed 0 identical load +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(4.19953e+07) +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:02Z INFO 49187 (sg02) [ShrinkDN]: INFO (ShrinkDN): Shrunk 2 nodes. Total savings 14336 bytes/partition +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-08-07T13:54:02Z INFO 49187 (sg01) [PreSched]: DONE PRE scheduling Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: pre_sched finished after 0.032 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 384mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1087 memory location(s), 1 block(s), and 6199 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=1087 blocks=1 instructions=6199 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: average loaded DMA size 4508 bytes +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: average saved DMA size 1464 bytes +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 46189568 +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 4508 bytes +2025-08-07T13:54:02Z INFO 49187 (sg01) [TensorCopyElim]: Tensor CP elimination: 0 +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 5439490 +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 1464 bytes +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 0, 0% out of total dma traffic +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 51629058, 81.3404% input load, 4.44277% output write, 14.2169% spill/reload [sg0000] +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 46189568 +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 4508 bytes +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 5439490 +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 1464 bytes +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 1582080 +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 343 bytes +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 2864 bytes +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: dma_optimization_sb finished after 0.011 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 384mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 875 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=875 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 (sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: SB Rotation rotated 3 Sb address +2025-08-07T13:54:02Z INFO 49187 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-08-07T13:54:02Z INFO 49187 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-08-07T13:54:02Z INFO 49187 (sg02) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-08-07T13:54:02Z INFO 49187 (sg02) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.01 seconds +2025-08-07T13:54:02Z INFO 49187 (sg02) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.013 seconds +2025-08-07T13:54:02Z USER 49187 (sg02) [ModuleForkPass]: vn_splitter finished after 0.036 seconds +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 384mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: SB Rotation rotated 20 Sb address +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9890 memory location(s), 1 block(s), and 51393 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:02Z USER 49187 (sg02) [ModuleForkPass]: Running constant_propagate +2025-08-07T13:54:02Z INFO 49187 (sg01) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg01) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=9890 blocks=1 instructions=51393 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:02Z INFO 49187 (sg01) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.009 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 384mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1087 memory location(s), 1 block(s), and 6199 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: Running dynamic_dma_setup +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=1087 blocks=1 instructions=6199 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: SB Rotation rotated 14 Sb address +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 384mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1088 memory location(s), 1 block(s), and 6199 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: Running runtime_memory_reservation +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=1088 blocks=1 instructions=6199 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 384mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1088 memory location(s), 1 block(s), and 6199 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: Running coloring_allocator_psum +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=1088 blocks=1 instructions=6199 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z INFO 49187 (sg01) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:02Z INFO 49187 (sg01) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:02Z INFO 49187 (sg01) [PSUM_Allocator]: allocating PSUM +2025-08-07T13:54:02Z INFO 49187 (sg01) [PSUM_Allocator]: main loop +2025-08-07T13:54:02Z INFO 49187 (sg01) [PSUM_Allocator]: renumber locations +2025-08-07T13:54:02Z INFO 49187 (sg01) [PSUM_Allocator]: size = 278 +2025-08-07T13:54:02Z INFO 49187 (sg01) [PSUM_Allocator]: build_no_bitmap start +2025-08-07T13:54:02Z INFO 49187 (sg01) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-08-07T13:54:02Z INFO 49187 (sg01) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-08-07T13:54:02Z INFO 49187 (sg01) [PSUM_Allocator]: found 382 edges +2025-08-07T13:54:02Z INFO 49187 (sg01) [PSUM_Allocator]: mean: 2.7482 +2025-08-07T13:54:02Z INFO 49187 (sg01) [PSUM_Allocator]: median: 2.20739 +2025-08-07T13:54:02Z INFO 49187 (sg01) [PSUM_Allocator]: adjacency vectors require 3056 bytes +2025-08-07T13:54:02Z INFO 49187 (sg01) [PSUM_Allocator]: build_no_bitmap done +2025-08-07T13:54:02Z INFO 49187 (sg01) [PSUM_Allocator]: find costs +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: SB Rotation rotated 80 Sb address +2025-08-07T13:54:02Z INFO 49187 (sg02) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: address_rotation_sb finished after 0.013 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 385mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 875 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running coloring_allocator_dram +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=875 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 (sg00) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:02Z INFO 49187 (sg00) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:02Z INFO 49187 (sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-08-07T13:54:02Z INFO 49187 (sg00) [DRAM_Allocator]: reserved space = 672926470 bytes +2025-08-07T13:54:02Z INFO 49187 (sg00) [DRAM_Allocator]: spill space = 7340032 bytes +2025-08-07T13:54:02Z INFO 49187 (sg00) [DRAM_Allocator]: aligned spill space = 7340032 bytes +2025-08-07T13:54:02Z INFO 49187 (sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-08-07T13:54:02Z INFO 49187 (sg00) [DRAM_Allocator]: renumber locations +2025-08-07T13:54:02Z INFO 49187 (sg00) [DRAM_Allocator]: size = 4 +2025-08-07T13:54:02Z INFO 49187 []: find first defs for local +2025-08-07T13:54:02Z INFO 49187 []: find first defs for global +2025-08-07T13:54:02Z INFO 49187 (sg00) [DRAM_Allocator]: Num intervals 4 Num locations 4 +2025-08-07T13:54:02Z INFO 49187 (sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-08-07T13:54:02Z INFO 49187 (sg00) [DRAM_Allocator]: info.neighbors init Done +2025-08-07T13:54:02Z INFO 49187 (sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-08-07T13:54:02Z INFO 49187 (sg00) [DRAM_Allocator]: simplify interference graph +2025-08-07T13:54:02Z INFO 49187 (sg00) [DRAM_Allocator]: initialize low and high +2025-08-07T13:54:02Z INFO 49187 (sg00) [DRAM_Allocator]: lo = 4 +2025-08-07T13:54:02Z INFO 49187 (sg00) [DRAM_Allocator]: hi = 0 +2025-08-07T13:54:02Z INFO 49187 (sg00) [DRAM_Allocator]: total = 4 +2025-08-07T13:54:02Z INFO 49187 (sg00) [DRAM_Allocator]: simplify +2025-08-07T13:54:02Z INFO 49187 (sg00) [DRAM_Allocator]: new candidates = 0 +2025-08-07T13:54:02Z INFO 49187 (sg00) [DRAM_Allocator]: select ranges +2025-08-07T13:54:02Z INFO 49187 (sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-08-07T13:54:02Z INFO 49187 (sg00) [DRAM_Allocator]: allreduce_dram_hwm 7340032 +2025-08-07T13:54:02Z INFO 49187 (sg00) [DRAM_Allocator]: Real CC buffer size 7340032 +2025-08-07T13:54:02Z INFO 49187 (sg00) [DRAM_Allocator]: DRAM hwm after allocation: 7340032 +2025-08-07T13:54:02Z INFO 49187 (sg00) [DRAM_Allocator]: DRAM allocation successful +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: coloring_allocator_dram finished after 0.003 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 385mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 875 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running address_rotation_dram +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=875 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: DRAM hwm before rotation 7340032 +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: allreduce hwm 7340032 +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: Real CC buffer size 7340032 +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: DRAM hwm after rotation 7340032 +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: address_rotation_dram finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 385mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 875 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running tensorcopy_accel +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=875 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 (sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-08-07T13:54:02Z INFO 49187 (sg00) [TensorCopyAccel::Impl]: Accelerated 0 out of 238 tensorcopy in Function: sg0000 average acceleration factor: -nan +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 385mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 875 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running peephole_opts +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=875 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 (sg00) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: peephole_opts finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 385mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 875 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running lower_kernel +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=875 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 (sg00) [LowerKernel]: Started running LowerKernel +2025-08-07T13:54:02Z INFO 49187 (sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 2143, number of allocs: 875 +2025-08-07T13:54:02Z INFO 49187 (sg00) [LowerKernel]: Scan BKs time (s): 6.5e-05 +2025-08-07T13:54:02Z INFO 49187 (sg00) [LowerKernel]: Lower BKs time (s): 5e-06 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: lower_kernel finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 385mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 875 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=875 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 385mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 875 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=875 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 385mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 875 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=875 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: birverifier finished after 0.002 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 385mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 875 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running dynamic_dma_scan +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=875 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: dynamic_dma_scan finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 385mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 875 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running build_fdeps +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=875 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 (sg00) [build_flow_deps]: Start build fdeps. Invocation: 3Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg00) [build_flow_deps]: Allocs: 875 instructions: 2143 +2025-08-07T13:54:02Z INFO 49187 (sg00) [build_flow_deps]: Build fdeps inserted 5443 edges +2025-08-07T13:54:02Z INFO 49187 (sg00) [build_flow_deps]: Done build fdeps 5443 Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: build_fdeps finished after 0.004 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 385mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 875 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running remove_redundancies +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=875 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 (sg00) [RemoveRedundancies]: remove_clobbered_writes +2025-08-07T13:54:02Z INFO 49187 (sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49187 (sg00) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-08-07T13:54:02Z INFO 49187 (sg00) [RemoveRedundancies]: remove_useless_insts +2025-08-07T13:54:02Z INFO 49187 (sg00) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: remove_redundancies finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 385mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 875 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=875 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 (sg00) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:02Z INFO 49187 (sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:54:02Z INFO 49187 (sg00) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.017 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 387mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 875 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=875 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 (sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-08-07T13:54:02Z INFO 49187 (sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.004 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 387mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 875 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=875 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 387mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 875 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running post_sched +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=875 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 [post_scheduler]: Start PosT ScheD 3 sunda Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg01) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-08-07T13:54:02Z INFO 49187 (sg01) [PSUM_Allocator]: simplify interference graph +2025-08-07T13:54:02Z INFO 49187 (sg01) [PSUM_Allocator]: initialize low and high +2025-08-07T13:54:02Z INFO 49187 (sg01) [PSUM_Allocator]: lo = 278 +2025-08-07T13:54:02Z INFO 49187 (sg01) [PSUM_Allocator]: hi = 0 +2025-08-07T13:54:02Z INFO 49187 (sg01) [PSUM_Allocator]: inf = 0 +2025-08-07T13:54:02Z INFO 49187 (sg01) [PSUM_Allocator]: total = 278 +2025-08-07T13:54:02Z INFO 49187 (sg01) [PSUM_Allocator]: simplify +2025-08-07T13:54:02Z INFO 49187 (sg01) [PSUM_Allocator]: new candidates = 0 +2025-08-07T13:54:02Z INFO 49187 (sg01) [PSUM_Allocator]: select ranges +2025-08-07T13:54:02Z INFO 49187 (sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg01) [PSUM_Allocator]: no more spills +2025-08-07T13:54:02Z INFO 49187 (sg01) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-08-07T13:54:02Z INFO 49187 (sg01) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-08-07T13:54:02Z INFO 49187 (sg01) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: coloring_allocator_psum finished after 0.051 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 387mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1088 memory location(s), 1 block(s), and 6199 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: Running dma_optimization_psum +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=1088 blocks=1 instructions=6199 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z INFO 49187 (sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: dma_optimization_psum finished after 0.004 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 387mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1088 memory location(s), 1 block(s), and 6199 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: Running address_rotation_psum +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=1088 blocks=1 instructions=6199 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z INFO 49187 [post_scheduler]: Time-aware hwm post-sched +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 24 PSUM Banks +2025-08-07T13:54:02Z INFO 49187 (sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 6 PSUM Banks +2025-08-07T13:54:02Z INFO 49187 (sg02) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 14 PSUM Banks +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: address_rotation_psum finished after 0.013 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 387mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1088 memory location(s), 1 block(s), and 6199 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: Running coloring_allocator_sb +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=1088 blocks=1 instructions=6199 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z INFO 49187 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 197517824 +2025-08-07T13:54:02Z INFO 49187 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 7177 bytes +2025-08-07T13:54:02Z INFO 49187 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 6291458 +2025-08-07T13:54:02Z INFO 49187 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 2729 bytes +2025-08-07T13:54:02Z INFO 49187 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 532480 +2025-08-07T13:54:02Z INFO 49187 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-08-07T13:54:02Z INFO 49187 (sg01) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:02Z INFO 49187 (sg01) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: allocating SB +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: main loop +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: renumber locations +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: size = 777 +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: find partners +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: found 256 accumulation groups +2025-08-07T13:54:02Z INFO 49187 [post_scheduler]: Time-aware simulation time: 539867 +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: largest = _dot.6-t1032_i15 +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: tensors = 50 +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: requires 73728 bytes/partition +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: expanding partners +2025-08-07T13:54:02Z INFO 49187 [post_scheduler]: Done PosT ScheD Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: post_sched finished after 0.032 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 387mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 875 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running expand_scheduling_units +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=875 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: expand_scheduling_units finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 387mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 875 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=875 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 []: find first defs for local +2025-08-07T13:54:02Z INFO 49187 []: find first defs for global +2025-08-07T13:54:02Z INFO 49187 (sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: find loads +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: 1 pin count +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: 129 remat count +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: build interference graph +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: pass 1 int-tree +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: Num intervals 777 Num locations 777 +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: IntervalTree Build Done +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: info.neighbors init Done +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: info.neighbors partners Done +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 52 PSUM Banks +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: IntervalTree readback Done +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: edge: 37107 +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: mean: 95.5135 +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: median: 78.5003 +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: find costs +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 36 PSUM Banks +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: simplify interference graph +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: initialize safe & unsafe +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: safe = 419 +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: unsafe = 233 +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: inf = 124 +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: total = 776 +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: simplify +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 25 PSUM Banks +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 225 #Pinned 0 #Safe 0 minCost 0.00525312 maxCost 0.0879926 locations 777 +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: new candidates = 125 +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: select ranges +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: Total: 776 +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: Spilled: 0.000 (0) +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: Allocated: 1.000 (776) +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: Rover zone: 0.655 (508) +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: Pre-rover zone: 0.013 (10) +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: Post-rover zone: 0.332 (258) +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: Blocks tall: 1.000 (776) +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: Visited until tall blocking (mean): 0.999 +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: Success +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: SB Rotation rotated 4 Sb address +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: SB Rotation rotated 19 Sb address +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: SB spills = 0 tensors +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: remats = 0 tensors +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: unpinned = 0 tensors +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: SB score = 0 +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: 16384 bytes/partition (100%) successfully pinned +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: pinning saved approximately 9010 cycles +2025-08-07T13:54:02Z INFO 49187 (sg01) [SB_Allocator]: 0% SB utilization after allocation +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:02Z INFO 49187 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 197517824 +2025-08-07T13:54:02Z INFO 49187 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 7177 bytes +2025-08-07T13:54:02Z INFO 49187 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 6291458 +2025-08-07T13:54:02Z INFO 49187 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2729 bytes +2025-08-07T13:54:02Z INFO 49187 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 532480 +2025-08-07T13:54:02Z INFO 49187 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: coloring_allocator_sb finished after 0.023 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 387mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1088 memory location(s), 1 block(s), and 6199 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=1088 blocks=1 instructions=6199 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: SB Rotation rotated 75 Sb address +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:02Z INFO 49187 (sg00) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: address_rotation_sb finished after 0.020 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 387mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 875 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=875 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 (sg00) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:02Z INFO 49187 (sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:54:02Z INFO 49187 (sg00) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: address_rotation_sb finished after 0.003 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 387mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1088 memory location(s), 1 block(s), and 6199 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: Running dma_optimization_sb +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=1088 blocks=1 instructions=6199 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 203809282, 94.8551% input load, 1.02898% output write, 4.11591% spill/reload [sg0001] +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: removed 0 identical load +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: sub-graph will get execute 35 times +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 131072, 0.0643111% out of total dma traffic(1.93324e+08) +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:02Z INFO 49187 (sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: average loaded DMA size 7239 bytes +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: average saved DMA size 2729 bytes +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 197386752 +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 7239 bytes +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.017 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 6291458 +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 2729 bytes +2025-08-07T13:54:02Z INFO 49187 (sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 388mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 875 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=875 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 (sg00) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:02Z INFO 49187 (sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-08-07T13:54:02Z INFO 49187 (sg00) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 131072, 0.0643111% out of total dma traffic +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 203678210, 94.8518% input load, 1.02964% output write, 4.11856% spill/reload [sg0001] +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 197386752 +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 7239 bytes +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 6291458 +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 2729 bytes +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 388mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 532480 +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 130 bytes +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 6065 bytes +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 875 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running dep_opt +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=875 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-08-07T13:54:02Z INFO 49187 (sg00) [build_flow_deps]: Start build fdeps. Invocation: 4Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: dma_optimization_sb finished after 0.018 seconds +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 388mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1085 memory location(s), 1 block(s), and 6197 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z USER 49187 (sg01) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:02Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=1085 blocks=1 instructions=6197 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:02Z INFO 49187 (sg00) [build_flow_deps]: Allocs: 875 instructions: 2143 +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: SB Rotation rotated 7 Sb address +2025-08-07T13:54:02Z INFO 49187 (sg00) [build_flow_deps]: Build fdeps inserted 5309 edges +2025-08-07T13:54:02Z INFO 49187 (sg00) [build_flow_deps]: Done build fdeps 5309 Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49187 (sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: SB Rotation rotated 72 Sb address +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: dep_opt finished after 0.006 seconds +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 388mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 875 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: Running report_stats +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=875 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z USER 49187 (sg02) [ModuleForkPass]: constant_propagate finished after 0.121 seconds +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 388mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49187 (sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 2 │ 1244659712 │ +│ DMACopy │ Internal -> ExternalOutput │ 16 │ 16777216 │ +│ DMACopy │ Internal -> Output │ 1 │ 4194304 │ +│ Load │ Const -> Internal │ 4 │ 41472 │ +│ Load │ ExternalInput -> Internal │ 45 │ 41953792 │ +│ Load │ Internal │ 40 │ 4194304 │ +│ Save │ Internal │ 24 │ 3145728 │ +│ Save │ Internal -> Output │ 12 │ 2293762 │ +└─────────────┴────────────────────────────┴───────┴────────────┘ + +2025-08-07T13:54:02Z INFO 49187 (sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 4 │ +│ 64 │ 2 │ +│ 256 │ 2 │ +│ 512 │ 52 │ +│ 1024 │ 1 │ +│ 2048 │ 8 │ +│ 4096 │ 2 │ +│ 8192 │ 56 │ +│ 262144 │ 16 │ +│ 2097152 │ 2 │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:02Z INFO 49187 (sg00) [ReportStats]: MM Stats: #MatMults 1216 #MatMult-Transposes 164 +2025-08-07T13:54:02Z INFO 49187 (sg00) [ReportStats]: IO Tensor size combined: 668477956 +2025-08-07T13:54:02Z INFO 49187 (sg00) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input76 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input77 │ ExternalInput │ bfloat16 │ 16777216 │ +│ input83 │ ExternalInput │ bfloat16 │ 16777216 │ +│ input78 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input81 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input5 │ ExternalInput │ bfloat16 │ 1048576 │ +│ input4 │ ExternalInput │ bfloat16 │ 1048576 │ +│ output1 │ ExternalOutput │ bfloat16 │ 1048576 │ +│ output2 │ ExternalOutput │ bfloat16 │ 1048576 │ +│ input79 │ ExternalInput │ bfloat16 │ 8192 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-08-07T13:54:02Z INFO 49187 (sg00) [ReportStats]: Large (Internal) Tensor Statistics: +┌───────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├───────────────────────────┼──────────┼──────────┼──────────────┤ +│ input78_local_1233 │ Internal │ bfloat16 │ 4194304 │ +│ input77_local_1245_i1 │ Internal │ bfloat16 │ 2097152 │ +│ dot.4-buffer-2167 │ Internal │ bfloat16 │ 2097152 │ +│ intermediate4-buffer-2169 │ Internal │ bfloat16 │ 2097152 │ +│ intermediate1 │ Output │ bfloat16 │ 2097152 │ +│ input77_local_1245_i0 │ Internal │ bfloat16 │ 2097152 │ +│ input77_local_1245_i3 │ Internal │ bfloat16 │ 2097152 │ +│ input77_local_1245_i2 │ Internal │ bfloat16 │ 2097152 │ +│ all_gather.1 │ Internal │ bfloat16 │ 2097152 │ +│ intermediate4 │ Output │ bfloat16 │ 2097152 │ +└───────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-08-07T13:54:02Z USER 49187 (sg00) [ModuleForkPass]: report_stats finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9890 memory location(s), 1 block(s), and 51393 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 388mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:02Z USER 49187 (sg02) [ModuleForkPass]: Running lower_ac +2025-08-07T13:54:02Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 875 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:02Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=9890 blocks=1 instructions=51393 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: SB Rotation rotated 7 Sb address +2025-08-07T13:54:02Z INFO 49187 (sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:03Z INFO 49187 (sg02) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-08-07T13:54:03Z USER 49187 (sg02) [ModuleForkPass]: lower_ac finished after 0.008 seconds +2025-08-07T13:54:03Z INFO 49187 (sg01) [DMAOptimizationBase]: SB Rotation rotated 38 Sb address +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 388mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9890 memory location(s), 1 block(s), and 51393 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:03Z USER 49187 (sg02) [ModuleForkPass]: Running input_dma_coalescing +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=9890 blocks=1 instructions=51393 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:03Z INFO 49187 (sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: address_rotation_sb finished after 0.017 seconds +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 388mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1085 memory location(s), 1 block(s), and 6197 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: Running coloring_allocator_dram +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=1085 blocks=1 instructions=6197 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z INFO 49187 (sg01) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:03Z INFO 49187 (sg01) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:03Z INFO 49187 (sg01) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-08-07T13:54:03Z INFO 49187 (sg01) [DRAM_Allocator]: reserved space = 205775368 bytes +2025-08-07T13:54:03Z INFO 49187 (sg01) [DRAM_Allocator]: spill space = 10485760 bytes +2025-08-07T13:54:03Z INFO 49187 (sg01) [DRAM_Allocator]: aligned spill space = 10485760 bytes +2025-08-07T13:54:03Z INFO 49187 (sg01) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-08-07T13:54:03Z INFO 49187 (sg01) [DRAM_Allocator]: renumber locations +2025-08-07T13:54:03Z INFO 49187 (sg01) [DRAM_Allocator]: size = 5 +2025-08-07T13:54:03Z INFO 49187 []: find first defs for local +2025-08-07T13:54:03Z INFO 49187 []: find first defs for global +2025-08-07T13:54:03Z INFO 49187 (sg01) [DRAM_Allocator]: Num intervals 5 Num locations 5 +2025-08-07T13:54:03Z INFO 49187 (sg01) [DRAM_Allocator]: IntervalTree Build Done +2025-08-07T13:54:03Z INFO 49187 (sg01) [DRAM_Allocator]: info.neighbors init Done +2025-08-07T13:54:03Z INFO 49187 (sg01) [DRAM_Allocator]: IntervalTree readback Done +2025-08-07T13:54:03Z INFO 49187 (sg01) [DRAM_Allocator]: simplify interference graph +2025-08-07T13:54:03Z INFO 49187 (sg01) [DRAM_Allocator]: initialize low and high +2025-08-07T13:54:03Z INFO 49187 (sg01) [DRAM_Allocator]: lo = 5 +2025-08-07T13:54:03Z INFO 49187 (sg01) [DRAM_Allocator]: hi = 0 +2025-08-07T13:54:03Z INFO 49187 (sg01) [DRAM_Allocator]: total = 5 +2025-08-07T13:54:03Z INFO 49187 (sg01) [DRAM_Allocator]: simplify +2025-08-07T13:54:03Z INFO 49187 (sg01) [DRAM_Allocator]: new candidates = 0 +2025-08-07T13:54:03Z INFO 49187 (sg01) [DRAM_Allocator]: select ranges +2025-08-07T13:54:03Z INFO 49187 (sg01) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-08-07T13:54:03Z INFO 49187 (sg01) [DRAM_Allocator]: allreduce_dram_hwm 8388608 +2025-08-07T13:54:03Z INFO 49187 (sg01) [DRAM_Allocator]: Real CC buffer size 8388608 +2025-08-07T13:54:03Z INFO 49187 (sg01) [DRAM_Allocator]: DRAM hwm after allocation: 10485760 +2025-08-07T13:54:03Z INFO 49187 (sg01) [DRAM_Allocator]: DRAM allocation successful +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: coloring_allocator_dram finished after 0.006 seconds +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 388mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1085 memory location(s), 1 block(s), and 6197 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: Running address_rotation_dram +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=1085 blocks=1 instructions=6197 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z INFO 49187 (sg01) [DMAOptimizationBase]: Runtime page size at 512MB +2025-08-07T13:54:03Z INFO 49187 (sg01) [DMAOptimizationBase]: DRAM hwm before rotation 10485760 +2025-08-07T13:54:03Z INFO 49187 (sg01) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-08-07T13:54:03Z INFO 49187 (sg01) [DMAOptimizationBase]: allreduce hwm 8388608 +2025-08-07T13:54:03Z INFO 49187 (sg01) [DMAOptimizationBase]: Real CC buffer size 8388608 +2025-08-07T13:54:03Z INFO 49187 (sg01) [DMAOptimizationBase]: DRAM hwm after rotation 10485760 +2025-08-07T13:54:03Z INFO 49187 (sg01) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: address_rotation_dram finished after 0.002 seconds +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 388mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1085 memory location(s), 1 block(s), and 6197 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: Running tensorcopy_accel +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=1085 blocks=1 instructions=6197 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z INFO 49187 (sg01) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-08-07T13:54:03Z INFO 49187 (sg01) [TensorCopyAccel::Impl]: Accelerated 8 out of 200 tensorcopy in Function: sg0001 average acceleration factor: 1 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: tensorcopy_accel finished after 0.000 seconds +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 388mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1085 memory location(s), 1 block(s), and 6197 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: Running peephole_opts +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=1085 blocks=1 instructions=6197 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z INFO 49187 (sg01) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: peephole_opts finished after 0.001 seconds +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 388mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1085 memory location(s), 1 block(s), and 6213 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: Running lower_kernel +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=1085 blocks=1 instructions=6213 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z INFO 49187 (sg01) [LowerKernel]: Started running LowerKernel +2025-08-07T13:54:03Z INFO 49187 (sg01) [LowerKernel]: Start of kernel lowering pass, number of insts: 6213, number of allocs: 1085 +2025-08-07T13:54:03Z INFO 49187 (sg01) [LowerKernel]: Scan BKs time (s): 0.001229 +2025-08-07T13:54:03Z INFO 49187 (sg01) [LowerKernel]: Lower BKs time (s): 5e-06 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: lower_kernel finished after 0.000 seconds +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 388mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1085 memory location(s), 1 block(s), and 6213 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: Running lower_nki_kernel +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=1085 blocks=1 instructions=6213 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 388mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1085 memory location(s), 1 block(s), and 6213 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=1085 blocks=1 instructions=6213 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.000 seconds +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 388mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1085 memory location(s), 1 block(s), and 6213 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=1085 blocks=1 instructions=6213 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: birverifier finished after 0.004 seconds +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 388mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1085 memory location(s), 1 block(s), and 6213 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: Running dynamic_dma_scan +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=1085 blocks=1 instructions=6213 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z INFO 49187 (sg02) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-08-07T13:54:03Z USER 49187 (sg02) [ModuleForkPass]: input_dma_coalescing finished after 0.016 seconds +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 388mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: dynamic_dma_scan finished after 0.001 seconds +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9890 memory location(s), 1 block(s), and 51393 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 388mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z USER 49187 (sg02) [ModuleForkPass]: Running remat_optimization +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1085 memory location(s), 1 block(s), and 6213 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: Running build_fdeps +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=1085 blocks=1 instructions=6213 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z INFO 49187 (sg01) [build_flow_deps]: Start build fdeps. Invocation: 5Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=9890 blocks=1 instructions=51393 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:03Z INFO 49187 (sg01) [build_flow_deps]: Allocs: 1085 instructions: 6213 +2025-08-07T13:54:03Z INFO 49187 (sg01) [build_flow_deps]: Build fdeps inserted 19743 edges +2025-08-07T13:54:03Z INFO 49187 (sg01) [build_flow_deps]: Done build fdeps 19743 Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: build_fdeps finished after 0.011 seconds +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 389mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1085 memory location(s), 1 block(s), and 6213 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: Running remove_redundancies +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=1085 blocks=1 instructions=6213 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z INFO 49187 (sg01) [RemoveRedundancies]: remove_clobbered_writes +2025-08-07T13:54:03Z INFO 49187 (sg01) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-08-07T13:54:03Z INFO 49187 (sg01) [RemoveRedundancies]: remove_useless_insts +2025-08-07T13:54:03Z INFO 49187 (sg01) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: remove_redundancies finished after 0.002 seconds +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 389mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1085 memory location(s), 1 block(s), and 6213 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=1085 blocks=1 instructions=6213 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z INFO 49187 (sg01) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:03Z INFO 49187 (sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:54:03Z INFO 49187 (sg01) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.047 seconds +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 391mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1085 memory location(s), 1 block(s), and 6213 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=1085 blocks=1 instructions=6213 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z INFO 49187 (sg01) [TensorCopyElim]: Tensor CP elimination: 0 +2025-08-07T13:54:03Z INFO 49187 (sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.005 seconds +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 391mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1085 memory location(s), 1 block(s), and 6213 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=1085 blocks=1 instructions=6213 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 391mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1085 memory location(s), 1 block(s), and 6213 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: Running post_sched +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=1085 blocks=1 instructions=6213 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z INFO 49187 [post_scheduler]: Start PosT ScheD 3 sunda Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49187 (sg02) [RematOpt]: Removed 0 remat instructions +2025-08-07T13:54:03Z USER 49187 (sg02) [ModuleForkPass]: remat_optimization finished after 0.081 seconds +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 391mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9890 memory location(s), 1 block(s), and 51393 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:03Z USER 49187 (sg02) [ModuleForkPass]: Running early_peephole_opts +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to early_peephole_opts: modules=1 functions=1 allocs=9890 blocks=1 instructions=51393 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:03Z INFO 49187 (sg02) [EarlyPeepholeOpts]: PeepholeOpts enabled? ActivationAccumulate: true +2025-08-07T13:54:03Z INFO 49187 (sg02) [EarlyPeepholeOpts]: Activation Accumulate: 0 +2025-08-07T13:54:03Z USER 49187 (sg02) [ModuleForkPass]: early_peephole_opts finished after 0.015 seconds +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 391mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9890 memory location(s), 1 block(s), and 51393 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:03Z USER 49187 (sg02) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=9890 blocks=1 instructions=51393 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:03Z USER 49187 (sg02) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.004 seconds +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 391mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9890 memory location(s), 1 block(s), and 51393 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:03Z USER 49187 (sg02) [ModuleForkPass]: Running infer_stream_ids +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=9890 blocks=1 instructions=51393 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:03Z USER 49187 (sg02) [ModuleForkPass]: infer_stream_ids finished after 0.004 seconds +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 391mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9890 memory location(s), 1 block(s), and 51393 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:03Z USER 49187 (sg02) [ModuleForkPass]: Running pre_sched +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=9890 blocks=1 instructions=51393 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:03Z INFO 49187 (sg02) [PreSched]: Start PRE scheduling 2 cores: 1 at: Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49187 [LayerSpiller]: LayerSpill: Start... +2025-08-07T13:54:03Z INFO 49187 [LayerSpiller]: LayerSpill: Found 1 Splits CCs +2025-08-07T13:54:03Z INFO 49187 [LayerSpiller]: Grouped CCs to 1 clusters. +2025-08-07T13:54:03Z INFO 49187 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-08-07T13:54:03Z INFO 49187 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-08-07T13:54:03Z INFO 49187 [LayerSpiller]: LayerSpill: Done. +2025-08-07T13:54:03Z INFO 49187 (sg02) [PreSched]: Start split live ranges Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49187 (sg02) [PreSched]: Num_Splits: 1 +2025-08-07T13:54:03Z INFO 49187 (sg02) [PreSched]: End split live ranges Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49187 (sg02) [PreSched]: Strt remove redundncies Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49187 (sg02) [PreSched]: remove_redundant_memsets +2025-08-07T13:54:03Z INFO 49187 [post_scheduler]: Time-aware hwm post-sched +2025-08-07T13:54:03Z INFO 49187 (sg02) [PreSched]: remove_redundant_memsets: 0 +2025-08-07T13:54:03Z INFO 49187 (sg02) [PreSched]: remove_redundant_loads +2025-08-07T13:54:03Z INFO 49187 (sg02) [PreSched]: remove_redundant_loads: 0 +2025-08-07T13:54:03Z INFO 49187 (sg02) [PreSched]: End remove redundncies Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49187 (sg02) [PreSched]: Start DCE Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49187 (sg02) [PreSched]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:03Z INFO 49187 [post_scheduler]: Time-aware simulation time: 41190135 +2025-08-07T13:54:03Z INFO 49187 (sg02) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49187 [post_scheduler]: Done PosT ScheD Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: post_sched finished after 0.145 seconds +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 396mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1085 memory location(s), 1 block(s), and 6213 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: Running expand_scheduling_units +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=1085 blocks=1 instructions=6213 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: expand_scheduling_units finished after 0.001 seconds +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 396mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1085 memory location(s), 1 block(s), and 6213 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=1085 blocks=1 instructions=6213 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z INFO 49187 (sg02) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49187 (sg02) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49187 (sg02) [PreSched]: End DCE Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49187 (sg02) [PreSched]: Start build flow dependencies Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49187 (sg02) [build_flow_deps]: Start build fdeps. Invocation: 6Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49187 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 77 PSUM Banks +2025-08-07T13:54:03Z INFO 49187 (sg02) [build_flow_deps]: Allocs: 9892 instructions: 51395 +2025-08-07T13:54:03Z INFO 49187 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 66 PSUM Banks +2025-08-07T13:54:03Z INFO 49187 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 1 PSUM Banks +2025-08-07T13:54:03Z INFO 49187 (sg01) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-08-07T13:54:03Z INFO 49187 (sg01) [DMAOptimizationBase]: SB Rotation rotated 42 Sb address +2025-08-07T13:54:03Z INFO 49187 (sg01) [DMAOptimizationBase]: SB Rotation rotated 9 Sb address +2025-08-07T13:54:03Z INFO 49187 (sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:03Z INFO 49187 (sg01) [DMAOptimizationBase]: SB Rotation rotated 35 Sb address +2025-08-07T13:54:03Z INFO 49187 (sg01) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-08-07T13:54:03Z INFO 49187 (sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: address_rotation_sb finished after 0.061 seconds +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 396mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1085 memory location(s), 1 block(s), and 6213 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=1085 blocks=1 instructions=6213 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z INFO 49187 (sg01) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:03Z INFO 49187 (sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:54:03Z INFO 49187 (sg01) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.038 seconds +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 401mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1085 memory location(s), 1 block(s), and 6213 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=1085 blocks=1 instructions=6213 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z INFO 49187 (sg01) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:03Z INFO 49187 (sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-08-07T13:54:03Z INFO 49187 (sg01) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.005 seconds +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 401mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1085 memory location(s), 1 block(s), and 6213 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: Running dep_opt +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=1085 blocks=1 instructions=6213 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z INFO 49187 (sg01) [build_flow_deps]: Start build fdeps. Invocation: 7Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49187 (sg01) [build_flow_deps]: Allocs: 1085 instructions: 6213 +2025-08-07T13:54:03Z INFO 49187 (sg01) [build_flow_deps]: Build fdeps inserted 19593 edges +2025-08-07T13:54:03Z INFO 49187 (sg01) [build_flow_deps]: Done build fdeps 19593 Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: dep_opt finished after 0.016 seconds +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 402mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1085 memory location(s), 1 block(s), and 6213 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: Running report_stats +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=1085 blocks=1 instructions=6213 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z INFO 49187 (sg01) [ReportStats]: Data Movement Statistics: sg0001 +┌─────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 6291456 │ +│ DMACopy │ Internal -> ExternalOutput │ 16 │ 16777216 │ +│ DMACopy │ Internal -> Output │ 1 │ 4194304 │ +│ Load │ Const -> Internal │ 2 │ 40960 │ +│ Load │ ExternalInput -> Internal │ 204 │ 192954880 │ +│ Load │ Input -> Internal │ 3 │ 196608 │ +│ Load │ Internal │ 4 │ 4194304 │ +│ Save │ Internal │ 16 │ 4194304 │ +│ Save │ Internal -> Output │ 3 │ 2097154 │ +└─────────────┴────────────────────────────┴───────┴───────────┘ + +2025-08-07T13:54:03Z INFO 49187 (sg01) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 3 │ +│ 64 │ 3 │ +│ 256 │ 1 │ +│ 512 │ 3 │ +│ 2048 │ 16 │ +│ 6144 │ 64 │ +│ 8192 │ 142 │ +│ 262144 │ 16 │ +│ 2097152 │ 5 │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:03Z INFO 49187 (sg01) [ReportStats]: MM Stats: #MatMults 5198 #MatMult-Transposes 276 +2025-08-07T13:54:03Z INFO 49187 (sg01) [ReportStats]: IO Tensor size combined: 197149188 +2025-08-07T13:54:03Z INFO 49187 (sg01) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input87 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input84 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input85 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input88 │ ExternalInput │ bfloat16 │ 16777216 │ +│ input94 │ ExternalInput │ bfloat16 │ 16777216 │ +│ input92 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input89 │ ExternalInput │ bfloat16 │ 4194304 │ +│ output4 │ ExternalOutput │ bfloat16 │ 1048576 │ +│ input7 │ ExternalInput │ bfloat16 │ 1048576 │ +│ input6 │ ExternalInput │ bfloat16 │ 1048576 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-08-07T13:54:03Z INFO 49187 (sg01) [ReportStats]: Large (Internal) Tensor Statistics: +┌──────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├──────────────────────┼──────────┼──────────┼──────────────┤ +│ input89_local_979 │ Internal │ bfloat16 │ 4194304 │ +│ input84_local_908_i6 │ Internal │ bfloat16 │ 3145728 │ +│ input84_local_908_i3 │ Internal │ bfloat16 │ 3145728 │ +│ input84_local_908_i2 │ Internal │ bfloat16 │ 3145728 │ +│ input84_local_908_i4 │ Internal │ bfloat16 │ 3145728 │ +│ input84_local_908_i5 │ Internal │ bfloat16 │ 3145728 │ +│ input84_local_908_i8 │ Internal │ bfloat16 │ 3145728 │ +│ input84_local_908_i7 │ Internal │ bfloat16 │ 3145728 │ +│ input84_local_908_i1 │ Internal │ bfloat16 │ 3145728 │ +│ input84_local_908_i0 │ Internal │ bfloat16 │ 3145728 │ +└──────────────────────┴──────────┴──────────┴──────────────┘ + +2025-08-07T13:54:03Z USER 49187 (sg01) [ModuleForkPass]: report_stats finished after 0.001 seconds +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 402mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1085 memory location(s), 1 block(s), and 6213 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:03Z INFO 49187 (sg02) [build_flow_deps]: Build fdeps inserted 180829 edges +2025-08-07T13:54:03Z INFO 49187 (sg02) [build_flow_deps]: Done build fdeps 180829 Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49187 (sg02) [PreSched]: End build flow dependencies Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49187 (sg02) [PreSched]: Start remove useless insts Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49187 (sg02) [PreSched]: remove_useless_insts +2025-08-07T13:54:03Z INFO 49187 (sg02) [PreSched]: remove Useless Instructions: 0 +2025-08-07T13:54:03Z INFO 49187 (sg02) [PreSched]: End remove useless insts Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49187 (sg02) [PreSched]: Start scratchpad optimization Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49187 (sg02) [PreSched]: End scratchpad optimization Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49187 (sg02) [PreSched]: DONE PRE scheduling Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z USER 49187 (sg02) [ModuleForkPass]: pre_sched finished after 0.367 seconds +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 403mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9892 memory location(s), 1 block(s), and 51395 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:03Z USER 49187 (sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=9892 blocks=1 instructions=51395 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:03Z INFO 49187 (sg02) [TensorCopyElim]: Tensor CP elimination: 1 +2025-08-07T13:54:03Z INFO 49187 (sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:03Z INFO 49187 (sg02) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49187 (sg02) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49187 (sg02) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z USER 49187 (sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.071 seconds +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 403mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9891 memory location(s), 1 block(s), and 51394 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:03Z USER 49187 (sg02) [ModuleForkPass]: Running dynamic_dma_setup +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=9891 blocks=1 instructions=51394 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:03Z USER 49187 (sg02) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 403mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9892 memory location(s), 1 block(s), and 51394 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:03Z USER 49187 (sg02) [ModuleForkPass]: Running runtime_memory_reservation +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=9892 blocks=1 instructions=51394 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:03Z USER 49187 (sg02) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 403mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9892 memory location(s), 1 block(s), and 51394 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:03Z USER 49187 (sg02) [ModuleForkPass]: Running coloring_allocator_psum +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=9892 blocks=1 instructions=51394 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:03Z INFO 49187 (sg02) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:03Z INFO 49187 (sg02) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:03Z INFO 49187 (sg02) [PSUM_Allocator]: allocating PSUM +2025-08-07T13:54:03Z INFO 49187 (sg02) [PSUM_Allocator]: main loop +2025-08-07T13:54:03Z INFO 49187 (sg02) [PSUM_Allocator]: renumber locations +2025-08-07T13:54:03Z INFO 49187 (sg02) [PSUM_Allocator]: size = 6107 +2025-08-07T13:54:03Z INFO 49187 (sg02) [PSUM_Allocator]: build_no_bitmap start +2025-08-07T13:54:03Z INFO 49187 (sg02) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-08-07T13:54:03Z INFO 49187 (sg02) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-08-07T13:54:03Z INFO 49187 (sg02) [PSUM_Allocator]: found 16810 edges +2025-08-07T13:54:03Z INFO 49187 (sg02) [PSUM_Allocator]: mean: 5.50516 +2025-08-07T13:54:03Z INFO 49187 (sg02) [PSUM_Allocator]: median: 6.9956 +2025-08-07T13:54:03Z INFO 49187 (sg02) [PSUM_Allocator]: adjacency vectors require 134480 bytes +2025-08-07T13:54:03Z INFO 49187 (sg02) [PSUM_Allocator]: build_no_bitmap done +2025-08-07T13:54:03Z INFO 49187 (sg02) [PSUM_Allocator]: find costs +2025-08-07T13:54:03Z INFO 49187 (sg02) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-08-07T13:54:03Z INFO 49187 (sg02) [PSUM_Allocator]: simplify interference graph +2025-08-07T13:54:03Z INFO 49187 (sg02) [PSUM_Allocator]: initialize low and high +2025-08-07T13:54:03Z INFO 49187 (sg02) [PSUM_Allocator]: lo = 6107 +2025-08-07T13:54:03Z INFO 49187 (sg02) [PSUM_Allocator]: hi = 0 +2025-08-07T13:54:03Z INFO 49187 (sg02) [PSUM_Allocator]: inf = 0 +2025-08-07T13:54:03Z INFO 49187 (sg02) [PSUM_Allocator]: total = 6107 +2025-08-07T13:54:03Z INFO 49187 (sg02) [PSUM_Allocator]: simplify +2025-08-07T13:54:03Z INFO 49187 (sg02) [PSUM_Allocator]: new candidates = 0 +2025-08-07T13:54:03Z INFO 49187 (sg02) [PSUM_Allocator]: select ranges +2025-08-07T13:54:03Z INFO 49187 (sg02) [PSUM_Allocator]: no more spills +2025-08-07T13:54:03Z INFO 49187 (sg02) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-08-07T13:54:03Z INFO 49187 (sg02) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-08-07T13:54:03Z INFO 49187 (sg02) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-08-07T13:54:03Z USER 49187 (sg02) [ModuleForkPass]: coloring_allocator_psum finished after 0.175 seconds +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 405mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9892 memory location(s), 1 block(s), and 51394 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:03Z USER 49187 (sg02) [ModuleForkPass]: Running dma_optimization_psum +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=9892 blocks=1 instructions=51394 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:03Z INFO 49187 (sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-08-07T13:54:03Z INFO 49187 (sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-08-07T13:54:03Z USER 49187 (sg02) [ModuleForkPass]: dma_optimization_psum finished after 0.039 seconds +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 405mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9892 memory location(s), 1 block(s), and 51394 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:03Z USER 49187 (sg02) [ModuleForkPass]: Running address_rotation_psum +2025-08-07T13:54:03Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=9892 blocks=1 instructions=51394 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:03Z INFO 49187 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2025-08-07T13:54:03Z INFO 49187 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 1 PSUM Banks +2025-08-07T13:54:04Z INFO 49187 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 1 PSUM Banks +2025-08-07T13:54:04Z USER 49187 (sg02) [ModuleForkPass]: address_rotation_psum finished after 0.251 seconds +2025-08-07T13:54:04Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 405mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9892 memory location(s), 1 block(s), and 51394 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:04Z USER 49187 (sg02) [ModuleForkPass]: Running coloring_allocator_sb +2025-08-07T13:54:04Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=9892 blocks=1 instructions=51394 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:04Z INFO 49187 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 777890846 +2025-08-07T13:54:04Z INFO 49187 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 7947 bytes +2025-08-07T13:54:04Z INFO 49187 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 4511754 +2025-08-07T13:54:04Z INFO 49187 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 1986 bytes +2025-08-07T13:54:04Z INFO 49187 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 8196 +2025-08-07T13:54:04Z INFO 49187 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 248 bytes +2025-08-07T13:54:04Z INFO 49187 (sg02) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:04Z INFO 49187 (sg02) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: allocating SB +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: main loop +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: renumber locations +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: size = 3746 +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: find partners +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: found 6103 accumulation groups +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: largest = _dot.256-t854_i7 +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: tensors = 50 +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: requires 73728 bytes/partition +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: expanding partners +2025-08-07T13:54:04Z INFO 49187 []: find first defs for local +2025-08-07T13:54:04Z INFO 49187 []: find first defs for global +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: find loads +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: 1 pin count +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: 712 remat count +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: build interference graph +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: pass 1 int-tree +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: Num intervals 3746 Num locations 3746 +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: IntervalTree Build Done +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: info.neighbors init Done +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: info.neighbors partners Done +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: IntervalTree readback Done +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: edge: 43261 +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: mean: 23.0972 +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: median: 16.7179 +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: find costs +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: simplify interference graph +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: initialize safe & unsafe +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: safe = 3412 +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: unsafe = 155 +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: inf = 178 +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: total = 3745 +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: simplify +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 150 #Pinned 0 #Safe 0 minCost 0.00525312 maxCost 1.24447 locations 3746 +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: new candidates = 122 +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: select ranges +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: Total: 3745 +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: Spilled: 0.000 (0) +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: Allocated: 1.000 (3745) +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: Rover zone: 0.915 (3428) +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: Pre-rover zone: 0.009 (33) +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: Post-rover zone: 0.075 (280) +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: Slice zone: 0.001 (4) +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: Blocks nothing: 0.054 (202) +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: Blocks medium: 0.003 (10) +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: Visited until medium blocking (mean): 0.641 +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: Visited until medium blocking (median): 0.698 +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: Visited until medium blocking (p95): 0.731 +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: Blocks tall: 0.943 (3533) +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: Visited until tall blocking (mean): 0.807 +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: Visited until tall blocking (median): 0.999 +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-08-07T13:54:04Z INFO 49187 (sg02) [SB_Allocator]: Success +2025-08-07T13:54:28Z INFO 49187 (sg02) [SB_Allocator]: SB spills = 0 tensors +2025-08-07T13:54:28Z INFO 49187 (sg02) [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:54:28Z INFO 49187 (sg02) [SB_Allocator]: remats = 0 tensors +2025-08-07T13:54:28Z INFO 49187 (sg02) [SB_Allocator]: unpinned = 0 tensors +2025-08-07T13:54:28Z INFO 49187 (sg02) [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:54:28Z INFO 49187 (sg02) [SB_Allocator]: SB score = 0 +2025-08-07T13:54:28Z INFO 49187 (sg02) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-08-07T13:54:28Z INFO 49187 (sg02) [SB_Allocator]: 16384 bytes/partition (100%) successfully pinned +2025-08-07T13:54:28Z INFO 49187 (sg02) [SB_Allocator]: pinning saved approximately 9010 cycles +2025-08-07T13:54:28Z INFO 49187 (sg02) [SB_Allocator]: 0% SB utilization after allocation +2025-08-07T13:54:28Z INFO 49187 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 777890846 +2025-08-07T13:54:28Z INFO 49187 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 7947 bytes +2025-08-07T13:54:28Z INFO 49187 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 4511754 +2025-08-07T13:54:28Z INFO 49187 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 1986 bytes +2025-08-07T13:54:28Z INFO 49187 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 8196 +2025-08-07T13:54:28Z INFO 49187 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 248 bytes +2025-08-07T13:54:28Z USER 49187 (sg02) [ModuleForkPass]: coloring_allocator_sb finished after 24.728 seconds +2025-08-07T13:54:28Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 405mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:28Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9892 memory location(s), 1 block(s), and 51394 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:28Z USER 49187 (sg02) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:28Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=9892 blocks=1 instructions=51394 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:28Z INFO 49187 (sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:28Z USER 49187 (sg02) [ModuleForkPass]: address_rotation_sb finished after 0.064 seconds +2025-08-07T13:54:28Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 405mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:28Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9892 memory location(s), 1 block(s), and 51394 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:28Z USER 49187 (sg02) [ModuleForkPass]: Running dma_optimization_sb +2025-08-07T13:54:28Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=9892 blocks=1 instructions=51394 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:28Z INFO 49187 (sg02) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 782402600, 98.8464% input load, 5.11246e-07% output write, 1.15357% spill/reload [sg0002] +2025-08-07T13:54:28Z INFO 49187 (sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:54:28Z INFO 49187 (sg02) [DMAOptimizationBase]: removed 0 identical load +2025-08-07T13:54:28Z INFO 49187 (sg02) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:54:28Z INFO 49187 (sg02) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:54:28Z INFO 49187 (sg02) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-08-07T13:54:28Z INFO 49187 (sg02) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-08-07T13:54:28Z INFO 49187 (sg02) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-08-07T13:54:28Z INFO 49187 (sg02) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(7.73377e+08) +2025-08-07T13:54:28Z INFO 49187 (sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 6 spill/reload instructions +2025-08-07T13:54:28Z INFO 49187 (sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 6 spill/reload memory locations +2025-08-07T13:54:28Z INFO 49187 (sg02) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload instructions +2025-08-07T13:54:28Z INFO 49187 (sg02) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload memory locations +2025-08-07T13:54:28Z INFO 49187 (sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 4100, 0.0454266% out of total spill/reload dma traffic +2025-08-07T13:54:28Z INFO 49187 (sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-08-07T13:54:28Z INFO 49187 (sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-08-07T13:54:28Z INFO 49187 (sg02) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 2 spill/reload instructions +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 1 spill/reload memory locations +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: average loaded DMA size 7958 bytes +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: average saved DMA size 2107 bytes +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 777888540 +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 7958 bytes +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 4509448 +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 2107 bytes +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 512, 0.00567278% out of total spill/reload dma traffic +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 4612, 0.000589466% out of total dma traffic +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 782397988, 98.847% input load, 5.11249e-07% output write, 1.15299% spill/reload [sg0002] +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 777888540 +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 7958 bytes +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 4509448 +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 2107 bytes +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 8196 +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 248 bytes +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 7830 bytes +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-08-07T13:54:29Z USER 49187 (sg02) [ModuleForkPass]: dma_optimization_sb finished after 0.301 seconds +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 411mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9884 memory location(s), 1 block(s), and 51387 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:29Z USER 49187 (sg02) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=9884 blocks=1 instructions=51387 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: SB Rotation rotated 303 Sb address +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: SB Rotation rotated 278 Sb address +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: SB Rotation rotated 172 Sb address +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: SB Rotation rotated 5 Sb address +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: SB Rotation rotated 213 Sb address +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:29Z USER 49187 (sg02) [ModuleForkPass]: address_rotation_sb finished after 0.273 seconds +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 411mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9884 memory location(s), 1 block(s), and 51387 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:29Z USER 49187 (sg02) [ModuleForkPass]: Running coloring_allocator_dram +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=9884 blocks=1 instructions=51387 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:29Z INFO 49187 (sg02) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:29Z INFO 49187 (sg02) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:29Z INFO 49187 (sg02) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-08-07T13:54:29Z INFO 49187 (sg02) [DRAM_Allocator]: reserved space = 777571354 bytes +2025-08-07T13:54:29Z INFO 49187 (sg02) [DRAM_Allocator]: spill space = 8707844 bytes +2025-08-07T13:54:29Z INFO 49187 (sg02) [DRAM_Allocator]: aligned spill space = 8749056 bytes +2025-08-07T13:54:29Z INFO 49187 (sg02) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-08-07T13:54:29Z INFO 49187 (sg02) [DRAM_Allocator]: renumber locations +2025-08-07T13:54:29Z INFO 49187 (sg02) [DRAM_Allocator]: size = 18 +2025-08-07T13:54:29Z INFO 49187 []: find first defs for local +2025-08-07T13:54:29Z INFO 49187 []: find first defs for global +2025-08-07T13:54:29Z INFO 49187 (sg02) [DRAM_Allocator]: Num intervals 18 Num locations 18 +2025-08-07T13:54:29Z INFO 49187 (sg02) [DRAM_Allocator]: IntervalTree Build Done +2025-08-07T13:54:29Z INFO 49187 (sg02) [DRAM_Allocator]: info.neighbors init Done +2025-08-07T13:54:29Z INFO 49187 (sg02) [DRAM_Allocator]: IntervalTree readback Done +2025-08-07T13:54:29Z INFO 49187 (sg02) [DRAM_Allocator]: simplify interference graph +2025-08-07T13:54:29Z INFO 49187 (sg02) [DRAM_Allocator]: initialize low and high +2025-08-07T13:54:29Z INFO 49187 (sg02) [DRAM_Allocator]: lo = 18 +2025-08-07T13:54:29Z INFO 49187 (sg02) [DRAM_Allocator]: hi = 0 +2025-08-07T13:54:29Z INFO 49187 (sg02) [DRAM_Allocator]: total = 18 +2025-08-07T13:54:29Z INFO 49187 (sg02) [DRAM_Allocator]: simplify +2025-08-07T13:54:29Z INFO 49187 (sg02) [DRAM_Allocator]: new candidates = 0 +2025-08-07T13:54:29Z INFO 49187 (sg02) [DRAM_Allocator]: select ranges +2025-08-07T13:54:29Z INFO 49187 (sg02) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-08-07T13:54:29Z INFO 49187 (sg02) [DRAM_Allocator]: allreduce_dram_hwm 4210688 +2025-08-07T13:54:29Z INFO 49187 (sg02) [DRAM_Allocator]: Real CC buffer size 4210688 +2025-08-07T13:54:29Z INFO 49187 (sg02) [DRAM_Allocator]: DRAM hwm after allocation: 6307840 +2025-08-07T13:54:29Z INFO 49187 (sg02) [DRAM_Allocator]: DRAM allocation successful +2025-08-07T13:54:29Z USER 49187 (sg02) [ModuleForkPass]: coloring_allocator_dram finished after 0.073 seconds +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 411mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9884 memory location(s), 1 block(s), and 51387 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:29Z USER 49187 (sg02) [ModuleForkPass]: Running address_rotation_dram +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=9884 blocks=1 instructions=51387 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: Runtime page size at 512MB +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: DRAM hwm before rotation 6307840 +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: allreduce hwm 4210688 +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: Real CC buffer size 4210688 +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: DRAM hwm after rotation 6307840 +2025-08-07T13:54:29Z INFO 49187 (sg02) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-08-07T13:54:29Z USER 49187 (sg02) [ModuleForkPass]: address_rotation_dram finished after 0.034 seconds +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 411mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9884 memory location(s), 1 block(s), and 51387 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:29Z USER 49187 (sg02) [ModuleForkPass]: Running tensorcopy_accel +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=9884 blocks=1 instructions=51387 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:29Z INFO 49187 (sg02) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-08-07T13:54:29Z INFO 49187 (sg02) [TensorCopyAccel::Impl]: Accelerated 0 out of 6070 tensorcopy in Function: sg0002 average acceleration factor: -nan +2025-08-07T13:54:29Z USER 49187 (sg02) [ModuleForkPass]: tensorcopy_accel finished after 0.005 seconds +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 411mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9884 memory location(s), 1 block(s), and 51387 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:29Z USER 49187 (sg02) [ModuleForkPass]: Running peephole_opts +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=9884 blocks=1 instructions=51387 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:29Z INFO 49187 (sg02) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-08-07T13:54:29Z USER 49187 (sg02) [ModuleForkPass]: peephole_opts finished after 0.016 seconds +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 411mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9884 memory location(s), 1 block(s), and 51390 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:29Z USER 49187 (sg02) [ModuleForkPass]: Running lower_kernel +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=9884 blocks=1 instructions=51390 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:29Z INFO 49187 (sg02) [LowerKernel]: Started running LowerKernel +2025-08-07T13:54:29Z INFO 49187 (sg02) [LowerKernel]: Start of kernel lowering pass, number of insts: 51390, number of allocs: 9884 +2025-08-07T13:54:29Z INFO 49187 (sg02) [LowerKernel]: Scan BKs time (s): 0.003281 +2025-08-07T13:54:29Z INFO 49187 (sg02) [LowerKernel]: Lower BKs time (s): 4e-06 +2025-08-07T13:54:29Z USER 49187 (sg02) [ModuleForkPass]: lower_kernel finished after 0.004 seconds +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 411mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9884 memory location(s), 1 block(s), and 51390 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:29Z USER 49187 (sg02) [ModuleForkPass]: Running lower_nki_kernel +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=9884 blocks=1 instructions=51390 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:29Z USER 49187 (sg02) [ModuleForkPass]: lower_nki_kernel finished after 0.004 seconds +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 411mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9884 memory location(s), 1 block(s), and 51390 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:29Z USER 49187 (sg02) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=9884 blocks=1 instructions=51390 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:29Z USER 49187 (sg02) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.005 seconds +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 411mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9884 memory location(s), 1 block(s), and 51390 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:29Z USER 49187 (sg02) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=9884 blocks=1 instructions=51390 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:29Z USER 49187 (sg02) [ModuleForkPass]: birverifier finished after 0.039 seconds +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 411mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9884 memory location(s), 1 block(s), and 51390 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:29Z USER 49187 (sg02) [ModuleForkPass]: Running dynamic_dma_scan +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=9884 blocks=1 instructions=51390 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:29Z USER 49187 (sg02) [ModuleForkPass]: dynamic_dma_scan finished after 0.005 seconds +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 411mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9884 memory location(s), 1 block(s), and 51390 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:29Z USER 49187 (sg02) [ModuleForkPass]: Running build_fdeps +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=9884 blocks=1 instructions=51390 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:29Z INFO 49187 (sg02) [build_flow_deps]: Start build fdeps. Invocation: 8Thu Aug 7 13:54:29 2025 +2025-08-07T13:54:29Z INFO 49187 (sg02) [build_flow_deps]: Allocs: 9884 instructions: 51390 +2025-08-07T13:54:29Z INFO 49187 (sg02) [build_flow_deps]: Build fdeps inserted 180824 edges +2025-08-07T13:54:29Z INFO 49187 (sg02) [build_flow_deps]: Done build fdeps 180824 Thu Aug 7 13:54:29 2025 +2025-08-07T13:54:29Z USER 49187 (sg02) [ModuleForkPass]: build_fdeps finished after 0.134 seconds +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 411mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9884 memory location(s), 1 block(s), and 51390 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:29Z USER 49187 (sg02) [ModuleForkPass]: Running remove_redundancies +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=9884 blocks=1 instructions=51390 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:29Z INFO 49187 (sg02) [RemoveRedundancies]: remove_clobbered_writes +2025-08-07T13:54:29Z INFO 49187 (sg02) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-08-07T13:54:29Z INFO 49187 (sg02) [RemoveRedundancies]: remove_useless_insts +2025-08-07T13:54:29Z INFO 49187 (sg02) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-08-07T13:54:29Z USER 49187 (sg02) [ModuleForkPass]: remove_redundancies finished after 0.019 seconds +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 411mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9884 memory location(s), 1 block(s), and 51390 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:29Z USER 49187 (sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=9884 blocks=1 instructions=51390 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:29Z INFO 49187 (sg02) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:29Z INFO 49187 (sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:54:29Z INFO 49187 (sg02) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:29Z USER 49187 (sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.237 seconds +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 441mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9884 memory location(s), 1 block(s), and 51390 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:29Z USER 49187 (sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:54:29Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=9884 blocks=1 instructions=51390 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:30Z INFO 49187 (sg02) [TensorCopyElim]: Tensor CP elimination: 0 +2025-08-07T13:54:30Z INFO 49187 (sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:30Z USER 49187 (sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.052 seconds +2025-08-07T13:54:30Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 441mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:30Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9884 memory location(s), 1 block(s), and 51390 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:30Z USER 49187 (sg02) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-08-07T13:54:30Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=9884 blocks=1 instructions=51390 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:30Z USER 49187 (sg02) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-08-07T13:54:30Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 441mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:30Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9884 memory location(s), 1 block(s), and 51390 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:30Z USER 49187 (sg02) [ModuleForkPass]: Running post_sched +2025-08-07T13:54:30Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=9884 blocks=1 instructions=51390 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:30Z INFO 49187 [post_scheduler]: Start PosT ScheD 3 sunda Thu Aug 7 13:54:30 2025 +2025-08-07T13:54:30Z INFO 49187 [post_scheduler]: Time-aware hwm post-sched +2025-08-07T13:54:31Z INFO 49187 [post_scheduler]: Time-aware simulation time: 5749922 +2025-08-07T13:54:31Z INFO 49187 [post_scheduler]: Done PosT ScheD Thu Aug 7 13:54:31 2025 +2025-08-07T13:54:31Z USER 49187 (sg02) [ModuleForkPass]: post_sched finished after 1.537 seconds +2025-08-07T13:54:31Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 481mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:31Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9884 memory location(s), 1 block(s), and 51390 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:31Z USER 49187 (sg02) [ModuleForkPass]: Running expand_scheduling_units +2025-08-07T13:54:31Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=9884 blocks=1 instructions=51390 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:31Z USER 49187 (sg02) [ModuleForkPass]: expand_scheduling_units finished after 0.006 seconds +2025-08-07T13:54:31Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 481mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:31Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9884 memory location(s), 1 block(s), and 51390 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:31Z USER 49187 (sg02) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:31Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=9884 blocks=1 instructions=51390 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:32Z INFO 49187 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 3640 PSUM Banks +2025-08-07T13:54:32Z INFO 49187 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 4473 PSUM Banks +2025-08-07T13:54:32Z INFO 49187 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 1 PSUM Banks +2025-08-07T13:54:32Z INFO 49187 (sg02) [DMAOptimizationBase]: SB Rotation rotated 9 Sb address +2025-08-07T13:54:32Z INFO 49187 (sg02) [DMAOptimizationBase]: SB Rotation rotated 42 Sb address +2025-08-07T13:54:32Z INFO 49187 (sg02) [DMAOptimizationBase]: SB Rotation rotated 39 Sb address +2025-08-07T13:54:32Z INFO 49187 (sg02) [DMAOptimizationBase]: SB Rotation rotated 3 Sb address +2025-08-07T13:54:32Z INFO 49187 (sg02) [DMAOptimizationBase]: SB Rotation rotated 120 Sb address +2025-08-07T13:54:32Z INFO 49187 (sg02) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-08-07T13:54:32Z INFO 49187 (sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:32Z USER 49187 (sg02) [ModuleForkPass]: address_rotation_sb finished after 1.067 seconds +2025-08-07T13:54:32Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 481mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:32Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9884 memory location(s), 1 block(s), and 51390 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:32Z USER 49187 (sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:32Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=9884 blocks=1 instructions=51390 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:32Z INFO 49187 (sg02) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:32Z INFO 49187 (sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:54:32Z INFO 49187 (sg02) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:32Z USER 49187 (sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.182 seconds +2025-08-07T13:54:32Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 488mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:32Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9884 memory location(s), 1 block(s), and 51390 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:32Z USER 49187 (sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:32Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=9884 blocks=1 instructions=51390 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:32Z INFO 49187 (sg02) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:32Z INFO 49187 (sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-08-07T13:54:32Z INFO 49187 (sg02) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:32Z USER 49187 (sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.033 seconds +2025-08-07T13:54:32Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 455mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:32Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9884 memory location(s), 1 block(s), and 51390 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:32Z USER 49187 (sg02) [ModuleForkPass]: Running dep_opt +2025-08-07T13:54:32Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=9884 blocks=1 instructions=51390 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:32Z INFO 49187 (sg02) [build_flow_deps]: Start build fdeps. Invocation: 9Thu Aug 7 13:54:32 2025 +2025-08-07T13:54:32Z INFO 49187 (sg02) [build_flow_deps]: Allocs: 9884 instructions: 51390 +2025-08-07T13:54:33Z INFO 49187 (sg02) [build_flow_deps]: Build fdeps inserted 177220 edges +2025-08-07T13:54:33Z INFO 49187 (sg02) [build_flow_deps]: Done build fdeps 177220 Thu Aug 7 13:54:33 2025 +2025-08-07T13:54:33Z USER 49187 (sg02) [ModuleForkPass]: dep_opt finished after 0.259 seconds +2025-08-07T13:54:33Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9884 memory location(s), 1 block(s), and 51390 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:33Z USER 49187 (sg02) [ModuleForkPass]: Running report_stats +2025-08-07T13:54:33Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=9884 blocks=1 instructions=51390 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:33Z INFO 49187 (sg02) [ReportStats]: Data Movement Statistics: sg0002 +┌─────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 6291456 │ +│ DMACopy │ Internal │ 1 │ 2097152 │ +│ Load │ Const -> Internal │ 4 │ 34824 │ +│ Load │ ExternalInput -> Internal │ 760 │ 773342220 │ +│ Load │ Internal │ 21 │ 4511496 │ +│ Save │ Internal │ 615 │ 4509444 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 4 │ +└─────────────┴────────────────────────────┴───────┴───────────┘ + +2025-08-07T13:54:33Z INFO 49187 (sg02) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 1 │ +│ 4 │ 9 │ +│ 8 │ 2 │ +│ 16 │ 3 │ +│ 64 │ 2 │ +│ 256 │ 2 │ +│ 512 │ 593 │ +│ 1024 │ 15 │ +│ 2048 │ 10 │ +│ 6144 │ 64 │ +│ 8192 │ 696 │ +│ 60768 │ 1 │ +│ 60776 │ 4 │ +│ 2097152 │ 3 │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:33Z INFO 49187 (sg02) [ReportStats]: MM Stats: #MatMults 42707 #MatMult-Transposes 19795 +2025-08-07T13:54:33Z INFO 49187 (sg02) [ReportStats]: IO Tensor size combined: 773342224 +2025-08-07T13:54:33Z INFO 49187 (sg02) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input473 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input469 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input472 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input470 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input474 │ ExternalInput │ bfloat16 │ 8192 │ +│ input471 │ ExternalInput │ bfloat16 │ 8192 │ +│ input1 │ ExternalInput │ int32 │ 1024 │ +│ input3 │ ExternalInput │ float32 │ 12 │ +│ output0 │ ExternalOutput │ int32 │ 4 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-08-07T13:54:33Z INFO 49187 (sg02) [ReportStats]: Large (Internal) Tensor Statistics: +┌───────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├───────────────────────┼──────────┼──────────┼──────────────┤ +│ input469_local_769_i1 │ Internal │ bfloat16 │ 3145728 │ +│ input469_local_769_i0 │ Internal │ bfloat16 │ 3145728 │ +│ input469_local_769_i7 │ Internal │ bfloat16 │ 3145728 │ +│ input469_local_769_i5 │ Internal │ bfloat16 │ 3145728 │ +│ input469_local_769_i3 │ Internal │ bfloat16 │ 3145728 │ +│ input469_local_769_i6 │ Internal │ bfloat16 │ 3145728 │ +│ input469_local_769_i9 │ Internal │ bfloat16 │ 3145728 │ +│ input469_local_769_i8 │ Internal │ bfloat16 │ 3145728 │ +│ input469_local_769_i4 │ Internal │ bfloat16 │ 3145728 │ +│ input469_local_769_i2 │ Internal │ bfloat16 │ 3145728 │ +└───────────────────────┴──────────┴──────────┴──────────────┘ + +2025-08-07T13:54:33Z USER 49187 (sg02) [ModuleForkPass]: report_stats finished after 0.011 seconds +2025-08-07T13:54:33Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9884 memory location(s), 1 block(s), and 51390 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:33Z USER 49187 [ModuleForkPass]: Compilation status: Total modules: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:33Z USER 49187 [BackendPassManager]: mod_parallel_pass finished after 30.341 seconds +2025-08-07T13:54:33Z INFO 49187 [BackendPassManager]: curr_vmrss: 459mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49187 [BackendPassManager]: Output has 3 module(s), 3 function(s), 11844 memory location(s), 3 block(s), and 59746 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:33Z USER 49187 [BackendPassManager]: Running assign_trigger_engine +2025-08-07T13:54:33Z INFO 49187 [BackendPassManager]: Inputs to assign_trigger_engine: modules=3 functions=3 allocs=11844 blocks=3 instructions=59746 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:33Z INFO 49187 (sg00) [AssignTriggerEngine]: Assigned trigger engine for 65 DMA instructions. Moved 41 DMA instructions to CC's engines. +2025-08-07T13:54:33Z INFO 49187 (sg01) [AssignTriggerEngine]: Assigned trigger engine for 19 DMA instructions. Moved 3 DMA instructions to CC's engines. +2025-08-07T13:54:33Z INFO 49187 (sg02) [AssignTriggerEngine]: Assigned trigger engine for 620 DMA instructions. Moved 5 DMA instructions to CC's engines. +2025-08-07T13:54:33Z USER 49187 [BackendPassManager]: assign_trigger_engine finished after 0.034 seconds +2025-08-07T13:54:33Z INFO 49187 [BackendPassManager]: curr_vmrss: 459mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49187 [BackendPassManager]: Output has 3 module(s), 3 function(s), 11844 memory location(s), 3 block(s), and 59746 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:33Z USER 49187 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:54:33Z INFO 49187 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=3 functions=3 allocs=11844 blocks=3 instructions=59746 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:33Z USER 49187 (sg00) [SubgraphForkPass]: Running lower_local_collectives +2025-08-07T13:54:33Z INFO 49187 (sg00) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=1 functions=1 allocs=875 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:33Z USER 49187 (sg00) [SubgraphForkPass]: lower_local_collectives finished after 0.000 seconds +2025-08-07T13:54:33Z INFO 49187 (sg00) [SubgraphForkPass]: curr_vmrss: 459mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49187 (sg00) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 875 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:33Z USER 49187 (sg00) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-08-07T13:54:33Z INFO 49187 (sg00) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=1 functions=1 allocs=875 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:33Z USER 49187 (sg00) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.000 seconds +2025-08-07T13:54:33Z INFO 49187 (sg00) [SubgraphForkPass]: curr_vmrss: 459mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49187 (sg00) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 875 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:33Z USER 49187 (sg00) [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:54:33Z INFO 49187 (sg00) [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=875 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:33Z USER 49187 (sg02) [SubgraphForkPass]: Running lower_local_collectives +2025-08-07T13:54:33Z USER 49187 (sg01) [SubgraphForkPass]: Running lower_local_collectives +2025-08-07T13:54:33Z INFO 49187 (sg01) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=1 functions=1 allocs=1085 blocks=1 instructions=6213 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:33Z USER 49187 (sg01) [SubgraphForkPass]: lower_local_collectives finished after 0.000 seconds +2025-08-07T13:54:33Z INFO 49187 (sg01) [SubgraphForkPass]: curr_vmrss: 459mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49187 (sg01) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 1085 memory location(s), 1 block(s), and 6213 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:33Z USER 49187 (sg01) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-08-07T13:54:33Z INFO 49187 (sg01) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=1 functions=1 allocs=1085 blocks=1 instructions=6213 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:33Z USER 49187 (sg01) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.000 seconds +2025-08-07T13:54:33Z INFO 49187 (sg01) [SubgraphForkPass]: curr_vmrss: 459mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49187 (sg01) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 1085 memory location(s), 1 block(s), and 6213 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:33Z USER 49187 (sg01) [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:54:33Z INFO 49187 (sg01) [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=1085 blocks=1 instructions=6213 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:33Z INFO 49187 (sg02) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=1 functions=1 allocs=9884 blocks=1 instructions=51390 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:33Z USER 49187 (sg02) [SubgraphForkPass]: lower_local_collectives finished after 0.000 seconds +2025-08-07T13:54:33Z INFO 49187 (sg02) [SubgraphForkPass]: curr_vmrss: 459mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49187 (sg02) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 9884 memory location(s), 1 block(s), and 51390 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:33Z USER 49187 (sg02) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-08-07T13:54:33Z INFO 49187 (sg02) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=1 functions=1 allocs=9884 blocks=1 instructions=51390 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:33Z USER 49187 (sg02) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.000 seconds +2025-08-07T13:54:33Z INFO 49187 (sg02) [SubgraphForkPass]: curr_vmrss: 459mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49187 (sg02) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 9884 memory location(s), 1 block(s), and 51390 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:33Z USER 49187 (sg02) [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:54:33Z INFO 49187 (sg02) [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=9884 blocks=1 instructions=51390 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:33Z INFO 49187 (sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:33Z USER 49187 (sg00) [SubgraphForkPass]: dead_code_elim finished after 0.006 seconds +2025-08-07T13:54:33Z INFO 49187 (sg00) [SubgraphForkPass]: curr_vmrss: 459mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49187 (sg00) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 875 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:33Z INFO 49187 (sg01) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:33Z USER 49187 (sg01) [SubgraphForkPass]: dead_code_elim finished after 0.011 seconds +2025-08-07T13:54:33Z INFO 49187 (sg01) [SubgraphForkPass]: curr_vmrss: 459mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49187 (sg01) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 1085 memory location(s), 1 block(s), and 6213 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:33Z INFO 49187 (sg02) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:33Z USER 49187 (sg02) [SubgraphForkPass]: dead_code_elim finished after 0.033 seconds +2025-08-07T13:54:33Z INFO 49187 (sg02) [SubgraphForkPass]: curr_vmrss: 459mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49187 (sg02) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 9884 memory location(s), 1 block(s), and 51390 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:33Z USER 49187 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:33Z USER 49187 [BackendPassManager]: subgraph_parallel_pass finished after 0.036 seconds +2025-08-07T13:54:33Z INFO 49187 [BackendPassManager]: curr_vmrss: 459mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49187 [BackendPassManager]: Output has 3 module(s), 3 function(s), 11844 memory location(s), 3 block(s), and 59746 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:33Z USER 49187 [BackendPassManager]: Running assign_hwdge_engine +2025-08-07T13:54:33Z INFO 49187 [BackendPassManager]: Inputs to assign_hwdge_engine: modules=3 functions=3 allocs=11844 blocks=3 instructions=59746 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:33Z USER 49187 [BackendPassManager]: assign_hwdge_engine finished after 0.006 seconds +2025-08-07T13:54:33Z INFO 49187 [BackendPassManager]: curr_vmrss: 459mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49187 [BackendPassManager]: Output has 3 module(s), 3 function(s), 11844 memory location(s), 3 block(s), and 59746 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:33Z USER 49187 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:54:33Z INFO 49187 [BackendPassManager]: Inputs to mod_parallel_pass: modules=3 functions=3 allocs=11844 blocks=3 instructions=59746 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:33Z USER 49187 (sg00) [ModuleForkPass]: Running alloc_queues +2025-08-07T13:54:33Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=875 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:33Z INFO 49187 (sg00) [AllocQueues]: DMACopy transpose will be triggered from multiple engines +2025-08-07T13:54:33Z USER 49187 (sg02) [ModuleForkPass]: Running alloc_queues +2025-08-07T13:54:33Z USER 49187 (sg01) [ModuleForkPass]: Running alloc_queues +2025-08-07T13:54:33Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=1085 blocks=1 instructions=6213 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:33Z INFO 49187 (sg01) [AllocQueues]: DMACopy transpose will be triggered from multiple engines +2025-08-07T13:54:33Z INFO 49187 (sg00) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 2 │ +│ qPoolIO0 │ input │ Pool │ 16 │ 1 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 4 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 24 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 40 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 73 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-08-07T13:54:33Z USER 49187 (sg00) [ModuleForkPass]: alloc_queues finished after 0.000 seconds +2025-08-07T13:54:33Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 875 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:33Z USER 49187 (sg00) [ModuleForkPass]: Running chain_dma_transposes +2025-08-07T13:54:33Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=875 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:33Z USER 49187 (sg00) [ModuleForkPass]: chain_dma_transposes finished after 0.000 seconds +2025-08-07T13:54:33Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 875 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:33Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=9884 blocks=1 instructions=51390 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:33Z USER 49187 (sg00) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-08-07T13:54:33Z INFO 49187 (sg02) [AllocQueues]: DMACopy transpose will be triggered from multiple engines +2025-08-07T13:54:33Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=875 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:33Z USER 49187 (sg00) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-08-07T13:54:33Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 875 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:33Z USER 49187 (sg00) [ModuleForkPass]: Running lower_control +2025-08-07T13:54:33Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=875 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:33Z INFO 49187 (sg01) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 1 │ +│ qPoolIO0 │ input │ Pool │ 16 │ 1 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 4 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 16 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 2 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 226 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-08-07T13:54:33Z USER 49187 (sg01) [ModuleForkPass]: alloc_queues finished after 0.001 seconds +2025-08-07T13:54:33Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1085 memory location(s), 1 block(s), and 6213 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:33Z USER 49187 (sg01) [ModuleForkPass]: Running chain_dma_transposes +2025-08-07T13:54:33Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=1085 blocks=1 instructions=6213 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:33Z USER 49187 (sg01) [ModuleForkPass]: chain_dma_transposes finished after 0.000 seconds +2025-08-07T13:54:33Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1085 memory location(s), 1 block(s), and 6213 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:33Z USER 49187 (sg01) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-08-07T13:54:33Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=1085 blocks=1 instructions=6213 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:33Z USER 49187 (sg01) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-08-07T13:54:33Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1085 memory location(s), 1 block(s), and 6213 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:33Z USER 49187 (sg01) [ModuleForkPass]: Running lower_control +2025-08-07T13:54:33Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=1085 blocks=1 instructions=6213 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:33Z INFO 49187 (sg00) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-08-07T13:54:33Z USER 49187 (sg00) [ModuleForkPass]: lower_control finished after 0.002 seconds +2025-08-07T13:54:33Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 875 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:33Z USER 49187 (sg00) [ModuleForkPass]: Running dep_reduction +2025-08-07T13:54:33Z INFO 49187 (sg00) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=875 blocks=1 instructions=2143 Max writers: 32 Max Readers: 160 +2025-08-07T13:54:33Z INFO 49187 (sg00) [DepReduction]: Start Dependency Reduction +2025-08-07T13:54:33Z INFO 49187 (sg00) [DepReduction]: Processing async instrs... +2025-08-07T13:54:33Z INFO 49187 (sg00) [DepReduction]: Processing secondary edges per engine... +2025-08-07T13:54:33Z INFO 49187 (sg00) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 1904 +2025-08-07T13:54:33Z INFO 49187 (sg02) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 5 │ +│ qPoolIO0 │ input │ Pool │ 16 │ 1 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 20 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 608 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 8 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 4 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 757 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-08-07T13:54:33Z USER 49187 (sg02) [ModuleForkPass]: alloc_queues finished after 0.005 seconds +2025-08-07T13:54:33Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49187 (sg00) [DepReduction]: Processing redundant descendants, Done. Num edges removed 2006 +2025-08-07T13:54:33Z INFO 49187 (sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 2006 +2025-08-07T13:54:33Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9884 memory location(s), 1 block(s), and 51390 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:33Z USER 49187 (sg02) [ModuleForkPass]: Running chain_dma_transposes +2025-08-07T13:54:33Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=9884 blocks=1 instructions=51390 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:33Z USER 49187 (sg02) [ModuleForkPass]: chain_dma_transposes finished after 0.000 seconds +2025-08-07T13:54:33Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9884 memory location(s), 1 block(s), and 51390 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:33Z USER 49187 (sg02) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-08-07T13:54:33Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=9884 blocks=1 instructions=51390 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:33Z USER 49187 (sg02) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-08-07T13:54:33Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9884 memory location(s), 1 block(s), and 51390 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:33Z USER 49187 (sg02) [ModuleForkPass]: Running lower_control +2025-08-07T13:54:33Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=9884 blocks=1 instructions=51390 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:33Z INFO 49187 (sg01) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-08-07T13:54:33Z USER 49187 (sg01) [ModuleForkPass]: lower_control finished after 0.008 seconds +2025-08-07T13:54:33Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1085 memory location(s), 1 block(s), and 6213 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:33Z USER 49187 (sg01) [ModuleForkPass]: Running dep_reduction +2025-08-07T13:54:33Z INFO 49187 (sg01) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=1085 blocks=1 instructions=6213 Max writers: 48 Max Readers: 276 +2025-08-07T13:54:33Z INFO 49187 (sg01) [DepReduction]: Start Dependency Reduction +2025-08-07T13:54:33Z INFO 49187 (sg01) [DepReduction]: Processing async instrs... +2025-08-07T13:54:33Z INFO 49187 (sg01) [DepReduction]: Processing secondary edges per engine... +2025-08-07T13:54:33Z INFO 49187 (sg00) [DepReduction]: Num Async removed: 0 +2025-08-07T13:54:33Z INFO 49187 (sg00) [DepReduction]: Finished dependency reduction: 12805 removed, new total 851 +2025-08-07T13:54:33Z INFO 49187 (sg00) [DepReduction]: Finished Dependency Reduction +2025-08-07T13:54:33Z USER 49187 (sg00) [ModuleForkPass]: dep_reduction finished after 0.014 seconds +2025-08-07T13:54:33Z INFO 49187 (sg00) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49187 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 875 memory location(s), 1 block(s), and 2143 instruction(s). Max writers: 32 Max Readers: 160 +2025-08-07T13:54:33Z INFO 49187 (sg01) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 8020 +2025-08-07T13:54:33Z INFO 49187 (sg01) [DepReduction]: Processing redundant descendants, Done. Num edges removed 8232 +2025-08-07T13:54:33Z INFO 49187 (sg01) [DepReduction]: Processing async instrs, Done. Num edges removed 8232 +2025-08-07T13:54:33Z INFO 49187 (sg02) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-08-07T13:54:33Z INFO 49187 (sg01) [DepReduction]: Num Async removed: 0 +2025-08-07T13:54:33Z INFO 49187 (sg01) [DepReduction]: Finished dependency reduction: 43619 removed, new total 1265 +2025-08-07T13:54:33Z INFO 49187 (sg01) [DepReduction]: Finished Dependency Reduction +2025-08-07T13:54:33Z USER 49187 (sg01) [ModuleForkPass]: dep_reduction finished after 0.047 seconds +2025-08-07T13:54:33Z INFO 49187 (sg01) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49187 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1085 memory location(s), 1 block(s), and 6213 instruction(s). Max writers: 48 Max Readers: 276 +2025-08-07T13:54:33Z USER 49187 (sg02) [ModuleForkPass]: lower_control finished after 0.059 seconds +2025-08-07T13:54:33Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9884 memory location(s), 1 block(s), and 51390 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:33Z USER 49187 (sg02) [ModuleForkPass]: Running dep_reduction +2025-08-07T13:54:33Z INFO 49187 (sg02) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=9884 blocks=1 instructions=51390 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:33Z INFO 49187 (sg02) [DepReduction]: Start Dependency Reduction +2025-08-07T13:54:33Z INFO 49187 (sg02) [DepReduction]: Processing async instrs... +2025-08-07T13:54:33Z INFO 49187 (sg02) [DepReduction]: Processing secondary edges per engine... +2025-08-07T13:54:33Z INFO 49187 (sg02) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 46606 +2025-08-07T13:54:33Z INFO 49187 (sg02) [DepReduction]: Processing redundant descendants, Done. Num edges removed 48013 +2025-08-07T13:54:33Z INFO 49187 (sg02) [DepReduction]: Processing async instrs, Done. Num edges removed 48013 +2025-08-07T13:54:34Z INFO 49187 (sg02) [DepReduction]: Num Async removed: 0 +2025-08-07T13:54:34Z INFO 49187 (sg02) [DepReduction]: Finished dependency reduction: 376907 removed, new total 15011 +2025-08-07T13:54:34Z INFO 49187 (sg02) [DepReduction]: Finished Dependency Reduction +2025-08-07T13:54:34Z USER 49187 (sg02) [ModuleForkPass]: dep_reduction finished after 0.709 seconds +2025-08-07T13:54:34Z INFO 49187 (sg02) [ModuleForkPass]: curr_vmrss: 484mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49187 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 9884 memory location(s), 1 block(s), and 51390 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:34Z USER 49187 [ModuleForkPass]: Compilation status: Total modules: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:34Z USER 49187 [BackendPassManager]: mod_parallel_pass finished after 0.812 seconds +2025-08-07T13:54:34Z INFO 49187 [BackendPassManager]: curr_vmrss: 484mb, ru_maxrss: 538mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49187 [BackendPassManager]: Output has 3 module(s), 3 function(s), 11844 memory location(s), 3 block(s), and 59746 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:34Z USER 49187 [BackendPassManager]: Running nc_parallel_pass +2025-08-07T13:54:34Z INFO 49187 [BackendPassManager]: Inputs to nc_parallel_pass: modules=3 functions=3 allocs=11844 blocks=3 instructions=59746 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:34Z USER 49187 [CoreForkPass]: Running bir_linker +2025-08-07T13:54:34Z INFO 49187 [CoreForkPass]: Inputs to bir_linker: modules=3 functions=3 allocs=11844 blocks=3 instructions=59746 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [BirLinker]: bir_linker cwd: +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [BirLinker]: Num intermediates 111 +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [BirLinker]: Num Module Definitions 3 +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [BirLinker]: Linking to a call-graph structure +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [BirLinker]: Added a new SpillReload Que qPoolPIOParam0 +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [BirLinker]: tensor_map verification successful. +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [BirLinker]: Writing updated tensor_map /home/ubuntu/qwen3/context_encoding_model/_tp0_bk1/neuronxcc-k7bxqych/sgLnk/sg00/tensor_map.json +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [BirLinker]: PostLink Stats: #MatMults 225853 #MatMult-Transposes 29619 +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [BirLinker]: Total Intermediate MMTs 2380 #out: 2240 #inp: 140 #symmetric: 0 +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [BirLinker]: Total Intermediate IOs with MMTs: 37 #out: 35 #inp: 2 #both: 0 +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [BirLinker]: releasing pre-link modules +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [BirLinker]: linking Done. +2025-08-07T13:54:34Z USER 49187 [CoreForkPass]: bir_linker finished after 0.722 seconds +2025-08-07T13:54:34Z INFO 49187 [CoreForkPass]: curr_vmrss: 753mb, ru_maxrss: 753mb (delta=215mb) +2025-08-07T13:54:34Z INFO 49187 [CoreForkPass]: Output has 1 module(s), 4 function(s), 12503 memory location(s), 4 block(s), and 59806 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:34Z USER 49187 [CoreForkPass]: Running postlnk_dma_report +2025-08-07T13:54:34Z INFO 49187 [CoreForkPass]: Inputs to postlnk_dma_report: modules=1 functions=4 allocs=12503 blocks=4 instructions=59806 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [DMAReport]: DMA Report: Bytes loaded or saved 1037705256, 97.1918% input load, 0.423137% output write, 2.38503% spill/reload +2025-08-07T13:54:34Z USER 49187 [CoreForkPass]: postlnk_dma_report finished after 0.007 seconds +2025-08-07T13:54:34Z INFO 49187 [CoreForkPass]: curr_vmrss: 419mb, ru_maxrss: 753mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49187 [CoreForkPass]: Output has 1 module(s), 4 function(s), 12503 memory location(s), 4 block(s), and 59806 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:34Z USER 49187 [CoreForkPass]: Running report_stats +2025-08-07T13:54:34Z INFO 49187 [CoreForkPass]: Inputs to report_stats: modules=1 functions=4 allocs=12503 blocks=4 instructions=59806 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [ReportStats]: Data Movement Statistics: main +┌─────────────┬──────┬───────┬───────┐ +│ Instruction │ Kind │ Count │ Bytes │ +└─────────────┴──────┴───────┴───────┘ + +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 2 │ 1244659712 │ +│ DMACopy │ Internal -> ExternalOutput │ 16 │ 16777216 │ +│ DMACopy │ Internal -> Output │ 1 │ 4194304 │ +│ Load │ Const -> Internal │ 4 │ 41472 │ +│ Load │ ExternalInput -> Internal │ 45 │ 41953792 │ +│ Load │ Internal │ 40 │ 4194304 │ +│ Save │ Internal │ 24 │ 3145728 │ +│ Save │ Internal -> Output │ 12 │ 2293762 │ +└─────────────┴────────────────────────────┴───────┴────────────┘ + +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 4 │ +│ 64 │ 2 │ +│ 256 │ 2 │ +│ 512 │ 52 │ +│ 1024 │ 1 │ +│ 2048 │ 8 │ +│ 4096 │ 2 │ +│ 8192 │ 56 │ +│ 262144 │ 16 │ +│ 2097152 │ 2 │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [ReportStats]: Data Movement Statistics: sg0001 +┌─────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 6291456 │ +│ DMACopy │ Internal -> ExternalOutput │ 16 │ 16777216 │ +│ DMACopy │ Internal -> Output │ 1 │ 4194304 │ +│ Load │ Const -> Internal │ 2 │ 40960 │ +│ Load │ ExternalInput -> Internal │ 204 │ 192954880 │ +│ Load │ Input -> Internal │ 3 │ 196608 │ +│ Load │ Internal │ 4 │ 4194304 │ +│ Save │ Internal │ 16 │ 4194304 │ +│ Save │ Internal -> Output │ 3 │ 2097154 │ +└─────────────┴────────────────────────────┴───────┴───────────┘ + +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 3 │ +│ 64 │ 3 │ +│ 256 │ 1 │ +│ 512 │ 3 │ +│ 2048 │ 16 │ +│ 6144 │ 64 │ +│ 8192 │ 142 │ +│ 262144 │ 16 │ +│ 2097152 │ 5 │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [ReportStats]: Data Movement Statistics: sg0002 +┌─────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 6291456 │ +│ DMACopy │ Internal │ 1 │ 2097152 │ +│ Load │ Const -> Internal │ 4 │ 34824 │ +│ Load │ ExternalInput -> Internal │ 760 │ 773342220 │ +│ Load │ Internal │ 21 │ 4511496 │ +│ Save │ Internal │ 615 │ 4509444 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 4 │ +└─────────────┴────────────────────────────┴───────┴───────────┘ + +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 1 │ +│ 4 │ 9 │ +│ 8 │ 2 │ +│ 16 │ 3 │ +│ 64 │ 2 │ +│ 256 │ 2 │ +│ 512 │ 593 │ +│ 1024 │ 15 │ +│ 2048 │ 10 │ +│ 6144 │ 64 │ +│ 8192 │ 696 │ +│ 60768 │ 1 │ +│ 60776 │ 4 │ +│ 2097152 │ 3 │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [ReportStats]: MM Stats: #MatMults 49121 #MatMult-Transposes 20235 +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [ReportStats]: IO Tensor size combined: 9981009964 +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬───────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼───────────────┼──────────┼──────────────┤ +│ input76_sg0000 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input473_sg0002 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input76 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input473 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input131 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input109 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input98 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input153 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input87 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input175 │ ExternalInput │ bfloat16 │ 50331648 │ +└────────────────────┴───────────────┴──────────┴──────────────┘ + +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [ReportStats]: Large (Internal) Tensor Statistics: +┌─────────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├─────────────────────────────┼──────────┼──────────┼──────────────┤ +│ input89_local_979_sg0001 │ Internal │ bfloat16 │ 4194304 │ +│ input78_local_1233_sg0000 │ Internal │ bfloat16 │ 4194304 │ +│ input84_local_908_i2_sg0001 │ Internal │ bfloat16 │ 3145728 │ +│ input84_local_908_i5_sg0001 │ Internal │ bfloat16 │ 3145728 │ +│ input84_local_908_i1_sg0001 │ Internal │ bfloat16 │ 3145728 │ +│ input84_local_908_i3_sg0001 │ Internal │ bfloat16 │ 3145728 │ +│ input84_local_908_i4_sg0001 │ Internal │ bfloat16 │ 3145728 │ +│ input84_local_908_i7_sg0001 │ Internal │ bfloat16 │ 3145728 │ +│ input84_local_908_i6_sg0001 │ Internal │ bfloat16 │ 3145728 │ +│ input84_local_908_i0_sg0001 │ Internal │ bfloat16 │ 3145728 │ +└─────────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-08-07T13:54:34Z USER 49187 [CoreForkPass]: report_stats finished after 0.014 seconds +2025-08-07T13:54:34Z INFO 49187 [CoreForkPass]: curr_vmrss: 419mb, ru_maxrss: 753mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49187 [CoreForkPass]: Output has 1 module(s), 4 function(s), 12503 memory location(s), 4 block(s), and 59806 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:34Z USER 49187 [CoreForkPass]: Running coloring_allocator_dram_post_lnk +2025-08-07T13:54:34Z INFO 49187 [CoreForkPass]: Inputs to coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=12503 blocks=4 instructions=59806 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [DRAM_Allocator]: allocating spills in DRAM post_link mode for address space Local +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [DRAM_Allocator]: reserved space = 8342040596 bytes +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [DRAM_Allocator]: spill space = 151191624 bytes +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [DRAM_Allocator]: aligned spill space = 151339008 bytes +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [DRAM_Allocator]: renumber locations +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [DRAM_Allocator]: size = 111 +2025-08-07T13:54:34Z INFO 49187 []: find first defs for local +2025-08-07T13:54:34Z INFO 49187 []: find first defs for global +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [DRAM_Allocator]: Num intervals 111 Num locations 111 +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [DRAM_Allocator]: IntervalTree Build Done +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [DRAM_Allocator]: info.neighbors init Done +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [DRAM_Allocator]: IntervalTree readback Done +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [DRAM_Allocator]: simplify interference graph +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [DRAM_Allocator]: initialize low and high +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [DRAM_Allocator]: lo = 111 +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [DRAM_Allocator]: hi = 0 +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [DRAM_Allocator]: total = 111 +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [DRAM_Allocator]: simplify +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [DRAM_Allocator]: new candidates = 0 +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [DRAM_Allocator]: Already used DRAM hwm: 10485760 +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [DRAM_Allocator]: select ranges +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [DRAM_Allocator]: allreduce_dram_hwm 10485760 +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [DRAM_Allocator]: Real CC buffer size 10485760 +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [DRAM_Allocator]: DRAM hwm after allocation: 21180416 +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [DRAM_Allocator]: DRAM allocation successful +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:34Z USER 49187 [CoreForkPass]: coloring_allocator_dram_post_lnk finished after 0.041 seconds +2025-08-07T13:54:34Z INFO 49187 [CoreForkPass]: curr_vmrss: 419mb, ru_maxrss: 753mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49187 [CoreForkPass]: Output has 1 module(s), 4 function(s), 12503 memory location(s), 4 block(s), and 59806 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:34Z USER 49187 [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_post_lnk +2025-08-07T13:54:34Z INFO 49187 [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=12503 blocks=4 instructions=59806 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:34Z USER 49187 [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_post_lnk finished after 0.027 seconds +2025-08-07T13:54:34Z INFO 49187 [CoreForkPass]: curr_vmrss: 420mb, ru_maxrss: 753mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49187 [CoreForkPass]: Output has 1 module(s), 4 function(s), 12503 memory location(s), 4 block(s), and 59806 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:34Z USER 49187 [CoreForkPass]: Running lower_dynamic_dma +2025-08-07T13:54:34Z INFO 49187 [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=4 allocs=12503 blocks=4 instructions=59806 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:34Z USER 49187 [CoreForkPass]: lower_dynamic_dma finished after 0.007 seconds +2025-08-07T13:54:34Z INFO 49187 [CoreForkPass]: curr_vmrss: 420mb, ru_maxrss: 753mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49187 [CoreForkPass]: Output has 1 module(s), 4 function(s), 12503 memory location(s), 4 block(s), and 59806 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:34Z USER 49187 [CoreForkPass]: Running legalize_dynamic_dma +2025-08-07T13:54:34Z INFO 49187 [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=4 allocs=12503 blocks=4 instructions=59806 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 1 DGE instructions +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 1 DGE instructions were scanned +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-08-07T13:54:34Z USER 49187 [CoreForkPass]: legalize_dynamic_dma finished after 0.020 seconds +2025-08-07T13:54:34Z INFO 49187 [CoreForkPass]: curr_vmrss: 420mb, ru_maxrss: 753mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49187 [CoreForkPass]: Output has 1 module(s), 4 function(s), 12503 memory location(s), 4 block(s), and 59806 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:34Z USER 49187 [CoreForkPass]: Running lower_dma +2025-08-07T13:54:34Z INFO 49187 [CoreForkPass]: Inputs to lower_dma: modules=1 functions=4 allocs=12503 blocks=4 instructions=59806 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:34Z INFO 49187 (sgLnk) [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 7971/7971 (100% DGE) + power-of-2 partition : 8016/8058 (99.4788% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 8016/8058 (99.4788% DGE) + Cast (DGE/DMA) + 128 partition : 145/145 (100% DGE) + power-of-2 partition : 145/146 (99.3151% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 145/146 (99.3151% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 0/849 (0% DGE) + power-of-2 partition : 0/1478 (0% DGE) + > 3 dimensional : 0/8 (0% DGE) + non-integer desc size : 0/0 + total : 0/1478 (0% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/0 + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/0 + CopyMode + CCE : 36 + Transpose : 0 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 1/1 (100% DGE) + vector : 578/578 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-08-07T13:54:34Z USER 49187 [CoreForkPass]: lower_dma finished after 0.053 seconds +2025-08-07T13:54:34Z INFO 49187 [CoreForkPass]: curr_vmrss: 420mb, ru_maxrss: 753mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49187 [CoreForkPass]: Output has 1 module(s), 4 function(s), 12503 memory location(s), 4 block(s), and 59806 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:34Z USER 49187 [CoreForkPass]: Running expand_all_engine +2025-08-07T13:54:34Z INFO 49187 [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=4 allocs=12503 blocks=4 instructions=59806 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:34Z USER 49187 [CoreForkPass]: expand_all_engine finished after 0.009 seconds +2025-08-07T13:54:34Z INFO 49187 [CoreForkPass]: curr_vmrss: 420mb, ru_maxrss: 753mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49187 [CoreForkPass]: Output has 1 module(s), 4 function(s), 12503 memory location(s), 4 block(s), and 59806 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:34Z USER 49187 [CoreForkPass]: Running alloc_semaphores +2025-08-07T13:54:34Z INFO 49187 [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=4 allocs=12503 blocks=4 instructions=59806 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:35Z USER 49187 [CoreForkPass]: alloc_semaphores finished after 0.043 seconds +2025-08-07T13:54:35Z INFO 49187 [CoreForkPass]: curr_vmrss: 420mb, ru_maxrss: 753mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49187 [CoreForkPass]: Output has 1 module(s), 4 function(s), 12503 memory location(s), 4 block(s), and 59806 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:35Z USER 49187 [CoreForkPass]: Running expand_inst_late +2025-08-07T13:54:35Z INFO 49187 [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=4 allocs=12503 blocks=4 instructions=59806 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:35Z USER 49187 [CoreForkPass]: expand_inst_late finished after 0.041 seconds +2025-08-07T13:54:35Z INFO 49187 [CoreForkPass]: curr_vmrss: 420mb, ru_maxrss: 753mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49187 [CoreForkPass]: Output has 1 module(s), 4 function(s), 12503 memory location(s), 4 block(s), and 59859 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:35Z USER 49187 [CoreForkPass]: Running seq_inst_opt +2025-08-07T13:54:35Z INFO 49187 [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=4 allocs=12503 blocks=4 instructions=59859 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [SeqInstOpt]: Removing 31 unnecessary InstRegisterMove instruction(s) from Block1 +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [SeqInstOpt]: Removing 15 unnecessary InstRegisterMove instruction(s) from Block1 +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-08-07T13:54:35Z USER 49187 [CoreForkPass]: seq_inst_opt finished after 0.006 seconds +2025-08-07T13:54:35Z INFO 49187 [CoreForkPass]: curr_vmrss: 420mb, ru_maxrss: 753mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49187 [CoreForkPass]: Output has 1 module(s), 4 function(s), 12503 memory location(s), 4 block(s), and 59813 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:35Z USER 49187 [CoreForkPass]: Running lower_sync +2025-08-07T13:54:35Z INFO 49187 [CoreForkPass]: Inputs to lower_sync: modules=1 functions=4 allocs=12503 blocks=4 instructions=59813 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:35Z USER 49187 [CoreForkPass]: lower_sync finished after 0.017 seconds +2025-08-07T13:54:35Z INFO 49187 [CoreForkPass]: curr_vmrss: 420mb, ru_maxrss: 753mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49187 [CoreForkPass]: Output has 1 module(s), 4 function(s), 12503 memory location(s), 4 block(s), and 61618 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:35Z USER 49187 [CoreForkPass]: Running lower_act +2025-08-07T13:54:35Z INFO 49187 [CoreForkPass]: Inputs to lower_act: modules=1 functions=4 allocs=12503 blocks=4 instructions=61618 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:35Z USER 49187 [CoreForkPass]: lower_act finished after 0.007 seconds +2025-08-07T13:54:35Z INFO 49187 [CoreForkPass]: curr_vmrss: 420mb, ru_maxrss: 753mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49187 [CoreForkPass]: Output has 1 module(s), 4 function(s), 12503 memory location(s), 4 block(s), and 61633 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:35Z USER 49187 [CoreForkPass]: Running lower_dve +2025-08-07T13:54:35Z INFO 49187 [CoreForkPass]: Inputs to lower_dve: modules=1 functions=4 allocs=12503 blocks=4 instructions=61633 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen2/dve_info.json +2025-08-07T13:54:35Z USER 49187 [CoreForkPass]: lower_dve finished after 0.064 seconds +2025-08-07T13:54:35Z INFO 49187 [CoreForkPass]: curr_vmrss: 424mb, ru_maxrss: 753mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49187 [CoreForkPass]: Output has 1 module(s), 4 function(s), 12503 memory location(s), 4 block(s), and 61633 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:35Z USER 49187 [CoreForkPass]: Running lower_ap +2025-08-07T13:54:35Z INFO 49187 [CoreForkPass]: Inputs to lower_ap: modules=1 functions=4 allocs=12503 blocks=4 instructions=61633 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:35Z USER 49187 [CoreForkPass]: lower_ap finished after 0.010 seconds +2025-08-07T13:54:35Z INFO 49187 [CoreForkPass]: curr_vmrss: 424mb, ru_maxrss: 753mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49187 [CoreForkPass]: Output has 1 module(s), 4 function(s), 12503 memory location(s), 4 block(s), and 61633 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:35Z USER 49187 [CoreForkPass]: Running coloring_allocator_reg +2025-08-07T13:54:35Z INFO 49187 [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=4 allocs=12503 blocks=4 instructions=61633 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: allocating REG +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: main loop iteration 1 +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: allocating REG +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: main loop iteration 1 +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: renumber registers +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: size = 3 +2025-08-07T13:54:35Z INFO 49187 []: find first defs for local reg +2025-08-07T13:54:35Z INFO 49187 []: find first defs for global reg +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: live range analysis +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: find costs +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: simplify interference graph +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: initialize low and high +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: lo = 3 +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: hi = 0 +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: inf = 0 +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: total = 3 +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: simplify +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: new candidates = 0 +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: select ranges +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: no more spills +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: allocating REG +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: main loop iteration 1 +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: renumber registers +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: size = 1 +2025-08-07T13:54:35Z INFO 49187 []: find first defs for local reg +2025-08-07T13:54:35Z INFO 49187 []: find first defs for global reg +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: live range analysis +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: find costs +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: simplify interference graph +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: initialize low and high +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: lo = 1 +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: hi = 0 +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: inf = 0 +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: total = 1 +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: simplify +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: new candidates = 0 +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: select ranges +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: no more spills +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: allocating REG +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: main loop iteration 1 +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: renumber registers +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: size = 4 +2025-08-07T13:54:35Z INFO 49187 []: find first defs for local reg +2025-08-07T13:54:35Z INFO 49187 []: find first defs for global reg +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: live range analysis +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: find costs +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: simplify interference graph +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: initialize low and high +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: lo = 4 +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: hi = 0 +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: inf = 0 +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: total = 4 +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: simplify +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: new candidates = 0 +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: select ranges +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: no more spills +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-08-07T13:54:35Z USER 49187 [CoreForkPass]: coloring_allocator_reg finished after 0.074 seconds +2025-08-07T13:54:35Z INFO 49187 [CoreForkPass]: curr_vmrss: 427mb, ru_maxrss: 753mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49187 [CoreForkPass]: Output has 1 module(s), 4 function(s), 12503 memory location(s), 4 block(s), and 61633 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:35Z USER 49187 [BackendPassManager]: nc_parallel_pass finished after 1.207 seconds +2025-08-07T13:54:35Z INFO 49187 [BackendPassManager]: curr_vmrss: 427mb, ru_maxrss: 753mb (delta=215mb) +2025-08-07T13:54:35Z INFO 49187 [BackendPassManager]: Output has 1 module(s), 4 function(s), 12503 memory location(s), 4 block(s), and 61633 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:35Z USER 49187 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:54:35Z INFO 49187 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=4 allocs=12503 blocks=4 instructions=61633 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:35Z USER 49187 [ModuleForkPass]: Running birverifier +2025-08-07T13:54:35Z INFO 49187 [ModuleForkPass]: Inputs to birverifier: modules=1 functions=4 allocs=12503 blocks=4 instructions=61633 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:35Z USER 49187 [ModuleForkPass]: birverifier finished after 0.058 seconds +2025-08-07T13:54:35Z INFO 49187 [ModuleForkPass]: curr_vmrss: 428mb, ru_maxrss: 753mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49187 [ModuleForkPass]: Output has 1 module(s), 4 function(s), 12503 memory location(s), 4 block(s), and 61633 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:35Z USER 49187 [BackendPassManager]: mod_parallel_pass finished after 0.060 seconds +2025-08-07T13:54:35Z INFO 49187 [BackendPassManager]: curr_vmrss: 428mb, ru_maxrss: 753mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49187 [BackendPassManager]: Output has 1 module(s), 4 function(s), 12503 memory location(s), 4 block(s), and 61633 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:35Z USER 49187 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:54:35Z INFO 49187 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=4 allocs=12503 blocks=4 instructions=61633 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:35Z USER 49187 [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:54:35Z INFO 49187 [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=4 allocs=12503 blocks=4 instructions=61633 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:35Z USER 49187 [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-08-07T13:54:35Z INFO 49187 [SubgraphForkPass]: curr_vmrss: 428mb, ru_maxrss: 753mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49187 [SubgraphForkPass]: Output has 1 module(s), 4 function(s), 12503 memory location(s), 4 block(s), and 61633 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:35Z USER 49187 [BackendPassManager]: subgraph_parallel_pass finished after 0.001 seconds +2025-08-07T13:54:35Z INFO 49187 [BackendPassManager]: curr_vmrss: 428mb, ru_maxrss: 753mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49187 [BackendPassManager]: Output has 1 module(s), 4 function(s), 12503 memory location(s), 4 block(s), and 61633 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:35Z USER 49187 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:54:35Z INFO 49187 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=4 allocs=12503 blocks=4 instructions=61633 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:35Z USER 49187 [ModuleForkPass]: Running codegen +2025-08-07T13:54:35Z INFO 49187 [ModuleForkPass]: Inputs to codegen: modules=1 functions=4 allocs=12503 blocks=4 instructions=61633 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [Codegen]: Total compiler allocated DRAM tensors: 0.0197258 GB +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [Codegen]: Total un-allocated DRAM tensors by kind: +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [Codegen]: +┌────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼─────────────┤ +│ ExternalInput │ 7.62851 │ +│ ExternalOutput │ 0.0703125 │ +│ Const │ 0.000124224 │ +└────────────────┴─────────────┘ + +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [Codegen]: Total runtime managed DRAM tensors: 7.69894 GB +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [Codegen]: Instruction Stats: +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [Codegen]: +┌─────────────────────┬───────┐ +│ Opcode │ Count │ +├─────────────────────┼───────┤ +│ MATMUL │ 49123 │ +│ LDWEIGHTS │ 48968 │ +│ ACTIVATE │ 7086 │ +│ EVENT_SEMAPHORE │ 1805 │ +│ UNKNOWN(0xd4) │ 1056 │ +│ PSEUDO_DMA_TRIGGER │ 741 │ +│ MATCH_VALUE_LOAD │ 441 │ +│ TENSOR_TENSOR │ 389 │ +│ FIND_INDEX8 │ 224 │ +│ MAX8 │ 224 │ +│ MATCH_REPLACE8 │ 217 │ +│ UNKNOWN(0xd3) │ 185 │ +│ TENSOR_SCALAR_ADDR │ 183 │ +│ UNKNOWN(0x8b) │ 100 │ +│ GATHER │ 99 │ +│ POOL_BUFFER_LOAD │ 99 │ +│ UNKNOWN(0x8a) │ 64 │ +│ TENSOR_SCALAR │ 57 │ +│ COPY │ 48 │ +│ UNKNOWN(0xda) │ 42 │ +│ TENSOR_REDUCE │ 40 │ +│ MEMSET │ 38 │ +│ UNKNOWN(0x8d) │ 32 │ +│ CAST │ 30 │ +│ PSEUDO_BRANCH_LABEL │ 20 │ +│ UNKNOWN(0xe8) │ 19 │ +│ UNKNOWN(0x92) │ 16 │ +│ ACT_TABLE_LOAD │ 15 │ +│ UNKNOWN(0xd2) │ 15 │ +│ UNKNOWN(0xcf) │ 10 │ +│ PSEUDO_DMA_REARM │ 10 │ +│ RECIPROCAL │ 7 │ +│ UNKNOWN(0xd9) │ 7 │ +│ LOAD_MASK_SELECT │ 4 │ +│ MOVE │ 4 │ +│ STREAM_SHUFFLE │ 4 │ +│ IOTA │ 4 │ +│ ALU_OP │ 2 │ +│ UNKNOWN(0xe5) │ 2 │ +│ PSEUDO_TENSOR_LOAD │ 1 │ +│ TENSOR_SCALAR │ 1 │ +│ RNG │ 1 │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [Codegen]: +┌────────────┬───────┐ +│ Engine │ Count │ +├────────────┼───────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 2418 │ +│ Scalar │ 8490 │ +│ Tensor │ 98416 │ +│ SyncDMA │ 0 │ +│ Vector │ 2030 │ +│ Sync │ 99 │ +│ All │ 0 │ +└────────────┴───────┘ + +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [Codegen]: Total instructions: 111453 (0.00664312 GB) +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [Codegen]: Total DynamicDMA instruction count: 1056 +2025-08-07T13:54:35Z USER 49187 (sgLnk) [Codegen]: isa_gen finished after 0.261 seconds +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────────────┼────────────────┤ +│ qActSpillReload0_defId_0 │ 6144 │ +│ qActSpillReload0_defId_1 │ 4096 │ +│ qActSpillReload0_defId_2 │ 3756 │ +│ qDVESpillReload0_defId_2 │ 8 │ +│ qPoolIO0 │ 2 │ +│ qPoolPIOParam0 │ 72 │ +│ qPoolSpillReload0_defId_0 │ 532480 │ +│ qPoolSpillReload0_defId_1 │ 512 │ +│ qPoolSpillReload0_defId_2 │ 1286 │ +│ qSPIO0 │ 36878 │ +│ qSPSpillReload0_defId_0 │ 770 │ +│ qSPSpillReload0_defId_1 │ 1024 │ +│ qSPSpillReload0_defId_2 │ 1310 │ +└───────────────────────────┴────────────────┘ + +Total descriptors: 588338 (0.00876692 GB) +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qSPIO0 │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qPoolDynamic │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qPoolIO0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +│ qPoolPIOParam0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 128 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [Codegen]: Tensors with largest descriptor count: +┌────────────────────────────┬───────────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├────────────────────────────┼───────────────┼──────────┼──────────────────┤ +│ input3 │ ExternalInput │ float32 │ 3 │ +│ dot.11-buffer-1696_sg0001 │ Internal │ bfloat16 │ 8 │ +│ dot.7-buffer-1691_sg0001 │ Internal │ bfloat16 │ 8 │ +│ dot.4-buffer-2167_sg0000 │ Internal │ bfloat16 │ 8 │ +│ dot.14-buffer-2754_sg0002 │ Internal │ bfloat16 │ 8 │ +│ transpose.1_sg0000 │ Internal │ bfloat16 │ 16 │ +│ all-reduce.519.1704_sg0001 │ Internal │ bfloat16 │ 35 │ +│ add.4_sg0001 │ Internal │ bfloat16 │ 37 │ +│ all_gather.1_sg0000 │ Internal │ bfloat16 │ 40 │ +│ convert.59_sg0002 │ Internal │ float32 │ 599 │ +└────────────────────────────┴───────────────┴──────────┴──────────────────┘ + +2025-08-07T13:54:35Z USER 49187 (sgLnk) [Codegen]: dma_desc_gen finished after 0.014 seconds +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [Codegen]: Estimated peak DRAM usage: 7.73408 GB +2025-08-07T13:54:35Z INFO 49187 (sgLnk) [Codegen]: Generating debug info +2025-08-07T13:54:35Z WARNING 49187 (sgLnk) [Codegen]: Found 127 instructions with more than 100 dependencies. For each such instruction, skipping writing more than 100 dependencies into the built-in NEFF debug info to prevent excessive compile time and NEFF size. For those instructions, the Neuron profiler will not display the skipped dependencies. +2025-08-07T13:54:35Z USER 49187 (sgLnk) [Codegen]: debug_info_gen finished after 0.140 seconds +2025-08-07T13:54:35Z USER 49187 [ModuleForkPass]: codegen finished after 0.431 seconds +2025-08-07T13:54:35Z INFO 49187 [ModuleForkPass]: curr_vmrss: 480mb, ru_maxrss: 753mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49187 [ModuleForkPass]: Output has 1 module(s), 4 function(s), 12503 memory location(s), 4 block(s), and 61633 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:35Z USER 49187 [BackendPassManager]: mod_parallel_pass finished after 0.433 seconds +2025-08-07T13:54:35Z INFO 49187 [BackendPassManager]: curr_vmrss: 480mb, ru_maxrss: 753mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49187 [BackendPassManager]: Output has 1 module(s), 4 function(s), 12503 memory location(s), 4 block(s), and 61633 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:35Z USER 49187 [BackendPassManager]: Running neff_packager +2025-08-07T13:54:35Z INFO 49187 [BackendPassManager]: Inputs to neff_packager: modules=1 functions=4 allocs=12503 blocks=4 instructions=61633 Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:35Z INFO 49187 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.9-1303_CRSM.npy +2025-08-07T13:54:35Z INFO 49187 [NeffPackager]: FileDeDuper file not found value_sg0000_identity_1503_CRSM.npy +2025-08-07T13:54:35Z INFO 49187 [NeffPackager]: FileDeDuper file not found value_sg0000_identity_1487_CRSM.npy +2025-08-07T13:54:35Z INFO 49187 [NeffPackager]: FileDeDuper file not found value_sg0000_t2181_CRSM.npy +2025-08-07T13:54:35Z INFO 49187 [NeffPackager]: FileDeDuper file not found value_sg0001_identity_1171_CRSM.npy +2025-08-07T13:54:35Z INFO 49187 [NeffPackager]: FileDeDuper file not found value_sg0001_t1707_CRSM.npy +2025-08-07T13:54:35Z INFO 49187 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.24_CRSM.npy +2025-08-07T13:54:35Z INFO 49187 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.25_CRSM.npy +2025-08-07T13:54:35Z INFO 49187 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.26-812-916_CRSM.npy +2025-08-07T13:54:35Z INFO 49187 [NeffPackager]: FileDeDuper file not found value_sg0002_identity_1052_CRSM.npy +2025-08-07T13:54:35Z INFO 49187 [NeffPackager]: Const File de-dup saved 0 KB of memory footprint +2025-08-07T13:54:35Z WARNING 49187 [NeffFileWriter]: writeKelp missing file /local/p4clients/pkgbuild-const/workspace/build/KaenaCompiler/KaenaCompiler-2.x.169490.0/AL2_x86_64/DEV.STD.PTHREAD/build/private/_skbuild/linux-x86_64-3.10/cmake-build/neuronxcc/walrus/neff_packager/MetricMetadata.json +2025-08-07T13:54:35Z INFO 49187 [NeffFileWriter]: Neff will be written to: /home/ubuntu/qwen3/context_encoding_model/_tp0_bk1/model.MODULE_2914133a46cb7b4660ab+d7af8a84.neff +2025-08-07T13:54:35Z INFO 49187 [NeffFileWriter]: IR signature: acc821adeac38c9b2a35f703d08f4858 for neff artifacts +2025-08-07T13:54:35Z USER 49187 [BackendPassManager]: neff_packager finished after 0.092 seconds +2025-08-07T13:54:35Z INFO 49187 [BackendPassManager]: curr_vmrss: 481mb, ru_maxrss: 753mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49187 [BackendPassManager]: Output has 1 module(s), 4 function(s), 12503 memory location(s), 4 block(s), and 61633 instruction(s). Max writers: 594 Max Readers: 19795 +2025-08-07T13:54:35Z INFO 49187 [BackendDriver]: HBM scratchpad usage summary (post-allocation): +┌──────┬───────────┬────────────────────────────────────────────────────────────┬─────────────┐ +│ Core │ Subgraph │ Description │ Value │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc00 │ sg00 │ Peak scratchpad usage: local │ 0.006836 GB │ +│ nc00 │ sg00 │ Total size of allocated tensors: local │ 0.006836 GB │ +│ nc00 │ sg01 │ Peak scratchpad usage: local │ 0.009766 GB │ +│ nc00 │ sg01 │ Total size of allocated tensors: local │ 0.009766 GB │ +│ nc00 │ sg02 │ Peak scratchpad usage: local │ 0.005875 GB │ +│ nc00 │ sg02 │ Total size of allocated tensors: local │ 0.008148 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local │ 0.009766 GB │ +│ nc00 │ Post-link │ Peak scratchpad usage after intermediate tensor allocation │ 0.019726 GB │ +│ nc00 │ Post-link │ Total size of allocated intermediate tensors │ 0.140945 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ Max │ Max │ Peak scratchpad usage │ 0.019726 GB │ +│ Max │ Max │ Peak scratchpad usage (page-aligned) │ 0.500000 GB │ +└──────┴───────────┴────────────────────────────────────────────────────────────┴─────────────┘ + +2025-08-07T13:54:35Z INFO 49187 [BackendDriver]: Backend completed successfully, tearing down. +2025-08-07T13:54:36Z INFO 47841 [job.WalrusDriver.0]: new_lnkState: {"model": ["/home/ubuntu/qwen3/context_encoding_model/_tp0_bk1/model.MODULE_2914133a46cb7b4660ab+d7af8a84.hlo_module.pb"], "tensormap": "tensor_map.json", "bir": "walrus_bir.out.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "cached_wavegraph": "walrus_bir.out.json", "state_dir": "/home/ubuntu/qwen3/context_encoding_model/_tp0_bk1/neuronxcc-k7bxqych/sgLnk/sg00", "state_id": "sgLnk"} +2025-08-07T13:54:36Z INFO 47841 [job.WalrusDriver.0]: MTBackend: completed successfully. +2025-08-07T13:54:36Z INFO 47841 [pipeline.Pipeline.0]: Finished job job.WalrusDriver.0 +2025-08-07T13:54:36Z INFO 47841 [pipeline.Pipeline.0]: Starting job job.BIRLinker.0 +2025-08-07T13:54:36Z INFO 47841 [job.BIRLinker.0]: Replay this job by calling: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/bin/neuronx-cc compile --framework XLA --state '{"model": ["/home/ubuntu/qwen3/context_encoding_model/_tp0_bk1/model.MODULE_2914133a46cb7b4660ab+d7af8a84.hlo_module.pb"], "tensormap": "tensor_map.json", "bir": "walrus_bir.out.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "cached_wavegraph": "walrus_bir.out.json", "state_dir": "/home/ubuntu/qwen3/context_encoding_model/_tp0_bk1/neuronxcc-k7bxqych/sgLnk/sg00", "state_id": "sgLnk"}' --pipeline BIRLinker +2025-08-07T13:54:36Z INFO 47841 [job.BIRLinker.0]: BIRLinker cwd: /home/ubuntu/qwen3/context_encoding_model/_tp0_bk1/neuronxcc-k7bxqych +2025-08-07T13:54:36Z INFO 47841 [job.BIRLinker.0]: Linking already done. +2025-08-07T13:54:36Z INFO 47841 [pipeline.Pipeline.0]: Finished job job.BIRLinker.0 +2025-08-07T13:54:36Z INFO 47841 [pipeline.Pipeline.0]: Starting job job.Kelper.0 +2025-08-07T13:54:36Z INFO 47841 [job.Kelper.0]: Skipping neff generation which was already performed by neff_packager +2025-08-07T13:54:36Z INFO 47841 [pipeline.Pipeline.0]: Finished job job.Kelper.0 +2025-08-07T13:54:36Z INFO 47841 [pipeline.Pipeline.0]: Starting job job.NeffWrapper.0 +2025-08-07T13:54:36Z INFO 47841 [job.NeffWrapper.0]: Job NeffWrapper len(in_states) 1 +2025-08-07T13:54:36Z INFO 47841 [job.NeffWrapper.0]: Processing input #0 +2025-08-07T13:54:36Z INFO 47841 [job.NeffWrapper.0]: Start NeffWrapper +2025-08-07T13:54:36Z INFO 47841 [job.NeffWrapper.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo-neff-wrapper --hlo /home/ubuntu/qwen3/context_encoding_model/_tp0_bk1/model.MODULE_2914133a46cb7b4660ab+d7af8a84.hlo_module.pb --neff /home/ubuntu/qwen3/context_encoding_model/_tp0_bk1/model.MODULE_2914133a46cb7b4660ab+d7af8a84.neff --io_transposes /home/ubuntu/qwen3/context_encoding_model/_tp0_bk1/neuronxcc-k7bxqych/io_transposes.json --output /home/ubuntu/qwen3/context_encoding_model/_tp0_bk1/wrapped_neff.hlo --netlist /home/ubuntu/qwen3/context_encoding_model/_tp0_bk1/neuronxcc-k7bxqych/hlo_netlist.json +2025-08-07T13:54:36Z INFO 47841 [job.NeffWrapper.0]: There are no io transposes nor zero-sized parameters. Output will not be produced. +Hlo neff wrapper finished successfully. Have a wonderful day :D + +2025-08-07T13:54:36Z INFO 47841 [job.NeffWrapper.0]: Job #0 finished +2025-08-07T13:54:36Z INFO 47841 [pipeline.Pipeline.0]: Finished job job.NeffWrapper.0 +2025-08-07T13:54:36Z INFO 47841 [pipeline.Pipeline.0]: Finished pipeline Pipeline +2025-08-07T13:54:36Z INFO 47841 [pipeline.Pipeline.0]: Job #0 finished +2025-08-07T13:54:36Z INFO 47579 [root]: Subcommand returned with exitcode=0 diff --git a/context_encoding_model/_tp0_bk1/metaneff.pb b/context_encoding_model/_tp0_bk1/metaneff.pb new file mode 100644 index 0000000000000000000000000000000000000000..feaa7e580dc29f7d81e4bacc9d94117e26d212ad --- /dev/null +++ b/context_encoding_model/_tp0_bk1/metaneff.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24459c80d98d706b0a4aca22eda28ff6c09f03a08393e76b58ee0ca668d1b851 +size 1152551 diff --git a/context_encoding_model/_tp0_bk1/model.MODULE_2914133a46cb7b4660ab+d7af8a84.hlo_module.pb b/context_encoding_model/_tp0_bk1/model.MODULE_2914133a46cb7b4660ab+d7af8a84.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..1aa5423f48909e5e7fd862d411027c6de7401598 --- /dev/null +++ b/context_encoding_model/_tp0_bk1/model.MODULE_2914133a46cb7b4660ab+d7af8a84.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b03debb723d63387ea26771f63729d616ac71a0dbfcb78d21d2194ff723fcbc1 +size 1229637 diff --git a/context_encoding_model/_tp0_bk1/model.MODULE_2914133a46cb7b4660ab+d7af8a84.neff b/context_encoding_model/_tp0_bk1/model.MODULE_2914133a46cb7b4660ab+d7af8a84.neff new file mode 100644 index 0000000000000000000000000000000000000000..2de3aaa470a89d9b3fd1549f8824d3ae3c51d59e --- /dev/null +++ b/context_encoding_model/_tp0_bk1/model.MODULE_2914133a46cb7b4660ab+d7af8a84.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60a8e4c285a690a146d149c675038f0498f62f761e4e3893706941d7ca8af583 +size 1659904 diff --git a/context_encoding_model/_tp0_bk1/neuron_config.json b/context_encoding_model/_tp0_bk1/neuron_config.json new file mode 100644 index 0000000000000000000000000000000000000000..00150a9148a646516db43dc85360ca029ef459e4 --- /dev/null +++ b/context_encoding_model/_tp0_bk1/neuron_config.json @@ -0,0 +1,220 @@ +{ + "_attn_implementation_autoset": false, + "_name_or_path": "Qwen/Qwen3-8B", + "add_cross_attention": false, + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "attribute_map": {}, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 151643, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 151645, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "fused_spec_config": null, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 12288, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 40960, + "max_window_layers": 36, + "metadata": null, + "min_length": 0, + "model_type": "qwen3", + "neuron_config": { + "activation_quantization_type": null, + "allow_input_truncation": false, + "apply_seq_ids_mask": false, + "async_mode": false, + "attention_dp_degree": 1, + "attention_dtype": null, + "attn_block_cte_nki_kernel_enabled": false, + "attn_block_tkg_nki_kernel_cache_update": false, + "attn_block_tkg_nki_kernel_enabled": false, + "attn_cls": { + "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3", + "__name__": "NeuronQwen3Attention" + }, + "attn_kernel_enabled": null, + "attn_tkg_builtin_kernel_enabled": false, + "attn_tkg_nki_kernel_enabled": false, + "batch_size": 1, + "bucket_n_active_tokens": true, + "buckets": [ + 256 + ], + "cast_type": "config", + "cc_pipeline_tiling_factor": 2, + "chunked_prefill_config": null, + "context_encoding_buckets": [ + 256 + ], + "cp_degree": 1, + "ctx_batch_size": 1, + "disable_kv_cache_tiling": false, + "draft_model_modules_to_not_convert": null, + "enable_bucketing": true, + "enable_eagle_draft_input_norm": false, + "enable_eagle_speculation": false, + "enable_fused_speculation": false, + "enable_long_context_mode": false, + "enable_output_completion_notifications": false, + "enable_spill_reload_dge": false, + "enable_token_tree": false, + "ep_degree": 1, + "expert_mlp_nki_kernel_enabled": null, + "flash_decoding_enabled": false, + "fused_qkv": false, + "fused_rmsnorm_skip_gamma": false, + "is_block_kv_layout": null, + "is_chunked_prefill": false, + "is_continuous_batching": true, + "is_eagle_draft": false, + "is_medusa": false, + "is_prefill_stage": true, + "is_prefix_caching": false, + "k_cache_transposed": false, + "kv_cache_batch_size": 1, + "kv_cache_padding_size": 0, + "kv_cache_quant": false, + "kv_cache_tiling": false, + "layer_boundary_markers": false, + "lm_head_pad": false, + "lm_head_pad_alignment_size": 1, + "local_ranks_size": 2, + "logical_nc_config": 1, + "lora_config": null, + "max_batch_size": 1, + "max_context_length": 1024, + "max_length": 1024, + "max_new_tokens": null, + "medusa_speculation_length": 0, + "medusa_tree": null, + "mlp_kernel_enabled": false, + "mlp_kernel_fuse_residual_add": false, + "modules_to_not_convert": null, + "moe_fused_nki_kernel_enabled": null, + "n_active_tokens": 1024, + "n_positions": 1024, + "num_medusa_heads": 0, + "on_cpu": false, + "on_device_sampling_config": { + "deterministic": false, + "do_sample": false, + "dynamic": true, + "global_topk": 256, + "on_device_sampling_config": true, + "temperature": 1.0, + "top_k": 1, + "top_k_kernel_enabled": false, + "top_p": 1.0 + }, + "output_logits": false, + "overrides_torch_dtype": true, + "pa_block_size": 1024, + "pa_num_blocks": 1, + "padding_side": "right", + "pp_degree": 1, + "prefix_buckets": null, + "qk_layernorm": false, + "qkv_kernel_enabled": false, + "qkv_kernel_fuse_residual_add": false, + "qkv_kernel_nbsd_layout": false, + "quantization_dtype": "int8", + "quantization_type": "per_tensor_symmetric", + "quantize_clamp_bound": Infinity, + "quantized": false, + "quantized_checkpoints_path": null, + "quantized_mlp_kernel_enabled": false, + "rmsnorm_quantize_kernel_enabled": false, + "router_topk_nki_kernel_enabled": null, + "rpl_reduce_dtype": null, + "save_sharded_checkpoint": true, + "scratchpad_page_size": null, + "seq_len": 1024, + "seq_len_threshold_for_cc_tiling": 16384, + "sequence_parallel_enabled": false, + "shared_mlp_nki_kernel_enabled": null, + "skip_sharding": false, + "skip_warmup": false, + "spec_batch_size": 1, + "speculation_length": 0, + "start_rank_id": 0, + "target": null, + "tile_cc": false, + "tkg_batch_size": 1, + "token_generation_buckets": null, + "token_tree_config": null, + "torch_dtype": "bfloat16", + "tp_degree": 2, + "vocab_parallel": false, + "weight_gather_seq_len_threshold": 32768, + "weights_to_skip_layout_optimization": [], + "world_size": 2 + }, + "no_repeat_ngram_size": 0, + "num_attention_heads": 32, + "num_beam_groups": 1, + "num_beams": 1, + "num_cores_per_group": 1, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sep_token_id": null, + "sliding_window": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torchscript": false, + "transformers_version": "4.51.0", + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/context_encoding_model/_tp0_bk2/command.txt b/context_encoding_model/_tp0_bk2/command.txt new file mode 100644 index 0000000000000000000000000000000000000000..41036a16ecf63b606ff3b6e0cd89383a3a43b520 --- /dev/null +++ b/context_encoding_model/_tp0_bk2/command.txt @@ -0,0 +1 @@ +neuronx-cc compile --framework=XLA model.MODULE_00594b8bc68e927f3dbe+1ad60ced.hlo_module.pb --output model.MODULE_00594b8bc68e927f3dbe+1ad60ced.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=1 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35 \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk2/compile_flags.MODULE_00594b8bc68e927f3dbe+1ad60ced.json b/context_encoding_model/_tp0_bk2/compile_flags.MODULE_00594b8bc68e927f3dbe+1ad60ced.json new file mode 100644 index 0000000000000000000000000000000000000000..79f14f12ddf7cafdc79db063afe6bb30333c915b --- /dev/null +++ b/context_encoding_model/_tp0_bk2/compile_flags.MODULE_00594b8bc68e927f3dbe+1ad60ced.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=1", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/home/ubuntu/qwen3/context_encoding_model/_tp0_bk2/log-neuron-cc.txt"] \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk2/global_metric_store.json b/context_encoding_model/_tp0_bk2/global_metric_store.json new file mode 100644 index 0000000000000000000000000000000000000000..37cdcce0083b838d3da317a3d75879aadbaff100 --- /dev/null +++ b/context_encoding_model/_tp0_bk2/global_metric_store.json @@ -0,0 +1,1079 @@ +{ + "Average": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 99.66542053222656, + "StaticProfiler::AveragePartitionUtilization": 97.7269515991211, + "StaticProfiler::AveragePeUtilization": 98.64861297607422, + "StaticProfiler::LocalizationEfficiency": 98.26979064941406, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 101.01405334472656, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0 + } + }, + "Count": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 1.0, + "StaticProfiler::AveragePartitionUtilization": 1.0, + "StaticProfiler::AveragePeUtilization": 1.0, + "StaticProfiler::LocalizationEfficiency": 1.0, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 1.0 + } + }, + "Sum": { + "compiletime": { + "AGOrderingAnalysisPass": 0.018257856369018555, + "AffinePredicateResolution": 0.0011677742004394531, + "AliasDependencyElimination": 0.0001201629638671875, + "AliasDependencyInduction": 0.0052988529205322266, + "AliasDependencyReset": 0.029210567474365234, + "BFComputeCutting": 0.0032625198364257813, + "BirCodeGenLoop": 0.4527714252471924, + "CCOpFusion": 0.02410125732421875, + "CanonicalizeConv": 0.00029399999766610563, + "CanonicalizeDAGForPGTiling": 0.004324913024902344, + "CanonicalizeForTensorizer": 4.8000001697801054e-05, + "CanonicalizeIR": 0.0019502639770507813, + "Canonicalizer": 0.0010809999657794833, + "CoalesceCCOp": 0.014672040939331055, + "CommuteConcat": 0.0008339881896972656, + "DMALocalityOpt": 0.005767107009887695, + "DMAProfiler": 0.012850046157836914, + "DMATilingProfiler": 0.004332065582275391, + "DataLocalityOpt": 0.07260942459106445, + "DataStreaming": 0.03969836235046387, + "DeConcat": 0.0005326271057128906, + "DeadCodeElimination": 0.0009255409240722656, + "DeadStoreElimination": 0.0055675506591796875, + "DelinearIndices": 0.004735231399536133, + "Delinearization": 0.0030374526977539063, + "DoNothing": 0.00018930435180664063, + "DramToDramTranspose": 0.018135547637939453, + "DumpGraphAndMetadata": 0.09476375579833984, + "EliminateDivs": 0.002595663070678711, + "ExpandBatchNorm": 0.002063274383544922, + "ExpandISAMacro": 0.011973381042480469, + "FactorizeBlkDims": 0.009292840957641602, + "FactorizeThreadAxesInFreeDims": 0.0010046958923339844, + "FlattenMacroLoop": 0.002232074737548828, + "GenericAccessSimplifier": 0.0018167495727539063, + "HoistCompute": 7.999999979801942e-06, + "IdentifyCrossPassTensors": 7.79999973019585e-05, + "InferInitValue": 0.024865150451660156, + "InferIntrinsicOnCC": 0.009101152420043945, + "InferNeuronTensor": 0.023293495178222656, + "InferNonlocalTensors": 0.01632833480834961, + "InferPSumTensor": 0.27726316452026367, + "InlineNativeKernels": 0.0081634521484375, + "InsertIOTransposes": 0.019203901290893555, + "InsertLocalTransposes": 0.0042340755462646484, + "InsertOffloadedTransposes": 0.002811431884765625, + "LICM": 0.0029730796813964844, + "LateLegalizeInst": 0.014307022094726563, + "LateLegalizePostSplit": 0.012536048889160156, + "LateLowerReshapeOp": 0.0018641948699951172, + "LateLowerTensorOp": 0.0014081001281738281, + "LateNeuronInstComb": 0.00915217399597168, + "LayoutPreprocessing": 0.02658390998840332, + "LayoutPreprocessingAndAnalysis": 0.10707235336303711, + "LayoutRequirementAnalysis": 0.005135536193847656, + "LegalizeCCOpLayout": 0.002307415008544922, + "LegalizeOpLevelAlias": 0.0012297630310058594, + "LegalizePartitionReduce": 0.0010194778442382813, + "LegalizeSundaAccess": 0.07808256149291992, + "LegalizeSundaMacro": 0.010968446731567383, + "LegalizeType": 0.012074947357177734, + "LocalLayoutOpt": 0.013799905776977539, + "LoopFusion": 0.0052182674407958984, + "LoopSplitting": 0.0003161430358886719, + "LowerBroadcast": 0.0015821456909179688, + "LowerCCOpBlockAxis": 0.0040547847747802734, + "LowerComplexBroadcast": 0.002165079116821289, + "LowerIntrinsics": 0.31156492233276367, + "LowerTensorOp": 0.010558843612670898, + "LowerTranspose": 0.012494325637817383, + "MacroGeneration": 0.029862642288208008, + "MaskPropagation": 0.002757549285888672, + "MemcastMotion": 3.400000059627928e-05, + "MemcpyElimination": 0.025969266891479492, + "MutateDataType": 0.002087831497192383, + "NeuronAliasDependencyInduction": 0.00016880035400390625, + "NeuronAliasDependencyReset": 0.020352602005004883, + "NeuronInstComb": 0.004656076431274414, + "NeuronLICM": 0.03560137748718262, + "NeuronLoopFusion": 0.007991313934326172, + "NeuronLoopInterchange": 0.002409219741821289, + "NeuronSimplifier": 0.007069587707519531, + "NeuronSimplifyPredicates": 0.12419009208679199, + "NeuronValueNumbering": 0.0032753944396972656, + "OptimizeAliasedCopyChain": 0.0005936622619628906, + "OptimizeNKIKernels": 0.5374257564544678, + "PAGLayoutOpt": 0.08115577697753906, + "PComputeCutting": 0.004801273345947266, + "PGLayoutTilingPipeline": 0.5454635620117188, + "PGTiling": 0.14933419227600098, + "PadElimination": 0.00034046173095703125, + "ParAxesAnnotation": 0.053552865982055664, + "PartialLoopFusion": 0.0067539215087890625, + "PartialSimdFusion": 0.00693058967590332, + "PenguinizeFunctions": 4.5000000682193786e-05, + "PerfectLoopNest": 0.0035321712493896484, + "PruneFunctions": 5.199999941396527e-05, + "RecognizeOpIdiom": 0.003947257995605469, + "Recompute": 0.00024962425231933594, + "RelaxPredicates": 0.013285398483276367, + "Rematerialization": 0.002062082290649414, + "RemoveOptimizationBarriers": 8.70000003487803e-05, + "ReshapeWeights": 0.002131223678588867, + "ResolveAccessConflict": 0.0038597583770751953, + "ResolveComplicatePredicates": 0.002032756805419922, + "RewriteReplicationMatmul": 0.001924753189086914, + "RewriteWeights": 0.002452373504638672, + "SFKVectorizer": 0.2718319892883301, + "ScatterMotion": 3.7999998312443495e-05, + "SimpleAllReduceTiling": 0.008960247039794922, + "Simplifier": 0.004038810729980469, + "SimplifyMacroPredicates": 0.010622739791870117, + "SimplifyNeuronTensor": 1.0594146251678467, + "SimplifySlice": 0.0009577274322509766, + "SimplifyTensor": 0.005341768264770508, + "SpillPSum": 0.012076139450073242, + "SplitAPUnionSets": 0.10771751403808594, + "SplitAccGrp": 0.002201557159423828, + "StaticProfiler": 0.012447118759155273, + "StaticTransposeLocalTensor": 0.0038712024688720703, + "SundaISel": 0.04214668273925781, + "TCTransform": 0.0008432865142822266, + "TensorInitialization": 0.012825727462768555, + "TensorOpSimplifier": 0.004651308059692383, + "TensorOpTransform": 0.019537687301635742, + "TensorizerLegalizationPass": 5.7999997807201e-05, + "TileCCOps": 0.006766319274902344, + "TilingProfiler": 0.006911277770996094, + "TransformConvOp": 0.0030303001403808594, + "TritiumFusion": 0.04502224922180176, + "ValueNumbering": 0.001996755599975586, + "VectorizeDMA": 0.0019402503967285156, + "VectorizeMatMult": 0.0027413368225097656, + "VerifySupportedOps": 3.7000001611886546e-05, + "WeightCoalescing": 0.008520841598510742, + "ZeroSizeTensorElimination": 0.00013709068298339844, + "algsimp": 0.0026940000243484974, + "batchnorm_expander": 4.400000034365803e-05, + "boundary-marker-removal": 1.5999999959603883e-05, + "call-inliner": 0.00046999999904073775, + "canonicalize-boundary-marker": 1.8999999156221747e-05, + "collective-stream-id-checker": 7.300000288523734e-05, + "comparison-expander": 0.0005740000051446259, + "computation-deduplicator": 7.999999797903001e-05, + "conditional-to-select": 1.8000000636675395e-05, + "config-lowering": 0.0003279999946244061, + "constant-statistics": 0.0005329999839887023, + "constant_folding": 0.0003260000084992498, + "cse": 4.5000000682193786e-05, + "dce": 8.399999933317304e-05, + "dot_decomposer": 0.0013409999664872885, + "dynamic-slice-transpose": 1.3999999282532372e-05, + "eliminate-redundant-compare": 0.0002959999837912619, + "emit-offloaded-dropout": 6.399999983841553e-05, + "flatten-call-graph": 0.0009319999953731894, + "fuse-send-recv": 6.999999459367245e-05, + "hilo::LegalizeAlias": 1.3999999282532372e-05, + "hilo::NeuronInstCombine": 0.0001660000125411898, + "hilo::NeuronOpFusion": 2.5000001187436283e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 5.2999999752501026e-05, + "hilo::ScheduleFusion": 7.000000096013537e-06, + "hilo::SixtyFourHack": 7.299999560927972e-05, + "hilo::VerifyAliasing": 6.000000212225132e-06, + "hlo-mac-count": 0.0013429999817162752, + "hlo-verifier": 0.007542999926954508, + "instruction-histogram": 0.0006709999870508909, + "io-con-pipe-begin": 4.999999873689376e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.001310999970883131, + "io-statistics": 8.499999967170879e-05, + "legalize-ccops": 3.999999989900971e-06, + "legalize-compare": 1.2999999853491317e-05, + "lower-argminmax-custom-call": 1.300000076298602e-05, + "map-inline": 0.0008850000449456275, + "metadata-naming": 5.999999848427251e-05, + "mlir::detail::OpToOpPassAdaptor": 0.00014399999054148793, + "mlir::hlo::MhloToPyPenguin": 0.004429999738931656, + "mlir::mhlo::LowerComplexExtraPass": 0.00027299998328089714, + "mlir::mhlo::LowerComplexPass": 0.0004909999552182853, + "native-to-custom-softmax": 0.0007070000283420086, + "native-to-custom-softmax-dx": 0.0005990000208839774, + "operand_upcaster": 4.900000203633681e-05, + "opt-barrier-removal": 0.0005510000046342611, + "post-par-pipe-begin": 8.999999408842996e-06, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0018570000538602471, + "pre-par-pipe-begin": 9.999999974752427e-07, + "pre-par-pipe-end": 0.0, + "pre-partition-simplification": 0.12893199920654297, + "replace-minimum-constant": 0.0004569999873638153, + "reshape-mover": 0.00012599999899975955, + "simplify-concat": 0.00015899998834356666, + "simplify-while-loops": 0.00010400000610388815, + "transform-variadic-reduce": 7.000000186963007e-05, + "tuple-simplifier": 0.0003150000120513141, + "unpack-nested-aws-ntwsr": 0.0004349999944679439, + "unroll-while-loop": 2.099999983329326e-05, + "zero_sized_hlo_elimination": 0.0008670000243000686 + }, + "hilo": { + "ConstantSize": 1189157.0, + "HloInputCount": 475.0, + "HloMacCount": 101242896384.0, + "HloOutputCount": 73.0, + "IfmapSize": 8266545152.0, + "OfmapSize": 75497472.0, + "OutputsReadFromCount": 0.0, + "PassthroughTensorsCount": 0.0, + "RedundantOutputCount": 0.0, + "Traffic": 1692493184.0 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 44382.0, + "StaticProfiler::AifUb": 205.154296875, + "StaticProfiler::ArithmeticIntensityTensorizer": 201.6046905517578, + "StaticProfiler::AverageDmaLength": 1901.806396484375, + "StaticProfiler::DDRTransferBytes": 795531072.0, + "StaticProfiler::InternalTransferBytes": 646388224.0, + "StaticProfiler::LoadExpanded": 376342.0, + "StaticProfiler::StoreExpanded": 4189.0, + "StaticProfiler::TotalDMAExpanded": 380531.0, + "StaticProfiler::TotalDynamicInstancesCount": 53882.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 53436.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 4.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 23616.0, + "TilingProfiler::NumPfTransposes": 5.0, + "TilingProfiler::NumPfTransposesForIo": 1.0, + "TilingProfiler::NumPfTransposesForLocal": 1.0, + "TilingProfiler::NumPfTransposesForNonlocal": 3.0, + "TilingProfiler::PfTransposeInstructions": 19393.0, + "TilingProfiler::PfTransposeInstructionsForIo": 19008.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 1.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 384.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 4.0, + "TilingProfiler::SimdInstructionsAfterTiling": 158.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "all": { + "compiletime": { + "algsimp": 0.002466999925673008, + "call-inliner": 0.0004360000020824373, + "collective-stream-id-checker": 6.299999949987978e-05, + "comparison-expander": 0.0005569999921135604, + "constant-statistics": 0.0005329999839887023, + "constant_folding": 0.0002969999914057553, + "dce": 7.999999797903001e-05, + "dot_decomposer": 0.0013409999664872885, + "eliminate-redundant-compare": 0.00028199999360367656, + "flatten-call-graph": 0.0008999999845400453, + "hlo-mac-count": 0.0010720000136643648, + "hlo-verifier": 0.0069679999724030495, + "instruction-histogram": 0.0006709999870508909, + "io-con-pipe-begin": 4.999999873689376e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.001310999970883131, + "io-statistics": 8.499999967170879e-05, + "map-inline": 0.0008440000237897038, + "native-to-custom-softmax": 0.0006750000175088644, + "native-to-custom-softmax-dx": 0.0005000000237487257, + "opt-barrier-removal": 0.0005510000046342611, + "pre-par-pipe-begin": 9.999999974752427e-07, + "pre-par-pipe-end": 0.0, + "pre-partition-simplification": 0.12893199920654297, + "replace-minimum-constant": 0.0004309999931138009, + "reshape-mover": 0.00011500000255182385, + "simplify-while-loops": 9.600000339560211e-05, + "tuple-simplifier": 0.0002969999914057553, + "unpack-nested-aws-ntwsr": 0.00042100000428035855, + "unroll-while-loop": 1.9999999494757503e-05, + "zero_sized_hlo_elimination": 0.0008670000243000686 + } + }, + "cumsum": { + "compiletime": { + "CoalesceCCOp": 0.00020885467529296875, + "DMALocalityOpt": 0.00016832351684570313, + "DMAProfiler": 0.0007588863372802734, + "DataStreaming": 0.00029587745666503906, + "DoNothing": 0.00011897087097167969, + "ExpandISAMacro": 0.0005011558532714844, + "FactorizeBlkDims": 0.00043463706970214844, + "InferPSumTensor": 0.00044608116149902344, + "LateLegalizeInst": 0.0004031658172607422, + "LateNeuronInstComb": 0.0005033016204833984, + "LegalizeSundaAccess": 0.0021431446075439453, + "LegalizeType": 0.00024056434631347656, + "LowerBroadcast": 0.00022101402282714844, + "LowerIntrinsics": 0.00023508071899414063, + "LowerTranspose": 0.0002219676971435547, + "NeuronInstComb": 0.0005297660827636719, + "NeuronLICM": 0.00041484832763671875, + "NeuronSimplifyPredicates": 0.0028023719787597656, + "NeuronValueNumbering": 0.00043582916259765625, + "SFKVectorizer": 0.002759695053100586, + "SimpleAllReduceTiling": 0.00020432472229003906, + "SimplifyNeuronTensor": 0.0004029273986816406, + "SpillPSum": 0.0005388259887695313, + "WeightCoalescing": 0.0002307891845703125 + } + }, + "sg00": { + "compiletime": { + "CanonicalizeConv": 2.300000051036477e-05, + "CanonicalizeForTensorizer": 2.300000051036477e-05, + "Canonicalizer": 0.0005249999812804163, + "HoistCompute": 3.000000106112566e-06, + "IdentifyCrossPassTensors": 3.099999958067201e-05, + "MemcastMotion": 9.999999747378752e-06, + "PenguinizeFunctions": 2.2000000171829015e-05, + "PruneFunctions": 1.2999999853491317e-05, + "RemoveOptimizationBarriers": 4.400000034365803e-05, + "ScatterMotion": 6.000000212225132e-06, + "TensorizerLegalizationPass": 3.600000127335079e-05, + "VerifySupportedOps": 1.700000029813964e-05, + "algsimp": 0.0001049999991664663, + "batchnorm_expander": 1.8999999156221747e-05, + "boundary-marker-removal": 7.000000096013537e-06, + "call-inliner": 1.4000000192027073e-05, + "canonicalize-boundary-marker": 7.999999979801942e-06, + "collective-stream-id-checker": 3.999999989900971e-06, + "comparison-expander": 7.000000096013537e-06, + "computation-deduplicator": 2.099999983329326e-05, + "conditional-to-select": 7.000000096013537e-06, + "config-lowering": 0.00027600000612437725, + "constant_folding": 1.2000000424450263e-05, + "cse": 2.2000000171829015e-05, + "dce": 1.9999999949504854e-06, + "dynamic-slice-transpose": 6.000000212225132e-06, + "eliminate-redundant-compare": 6.000000212225132e-06, + "emit-offloaded-dropout": 3.7999998312443495e-05, + "flatten-call-graph": 1.2999999853491317e-05, + "fuse-send-recv": 3.099999958067201e-05, + "hilo::LegalizeAlias": 7.000000096013537e-06, + "hilo::NeuronInstCombine": 6.299999949987978e-05, + "hilo::NeuronOpFusion": 6.000000212225132e-06, + "hilo::ReplaceTokenTypeWithU8Pass": 2.300000051036477e-05, + "hilo::ScheduleFusion": 1.9999999949504854e-06, + "hilo::SixtyFourHack": 2.099999983329326e-05, + "hilo::VerifyAliasing": 3.000000106112566e-06, + "hlo-mac-count": 7.300000288523734e-05, + "hlo-verifier": 0.00023600000713486224, + "legalize-ccops": 1.9999999949504854e-06, + "legalize-compare": 6.000000212225132e-06, + "lower-argminmax-custom-call": 6.000000212225132e-06, + "map-inline": 1.700000029813964e-05, + "metadata-naming": 2.499999936844688e-05, + "mlir::detail::OpToOpPassAdaptor": 2.2000000171829015e-05, + "mlir::hlo::MhloToPyPenguin": 0.002633000025525689, + "mlir::mhlo::LowerComplexExtraPass": 0.0001049999991664663, + "mlir::mhlo::LowerComplexPass": 0.00017299999308306724, + "native-to-custom-softmax": 2.099999983329326e-05, + "native-to-custom-softmax-dx": 6.600000051548705e-05, + "operand_upcaster": 2.2000000171829015e-05, + "post-par-pipe-begin": 4.999999873689376e-06, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0008430000161752105, + "replace-minimum-constant": 1.1000000085914508e-05, + "reshape-mover": 4.999999873689376e-06, + "simplify-concat": 6.70000008540228e-05, + "simplify-while-loops": 3.999999989900971e-06, + "transform-variadic-reduce": 1.2999999853491317e-05, + "tuple-simplifier": 7.999999979801942e-06, + "unpack-nested-aws-ntwsr": 6.000000212225132e-06, + "unroll-while-loop": 9.999999974752427e-07 + }, + "hilo": { + "ArithmeticIntensity": 34.445003509521484, + "ConstantSize": 1189157.0, + "HloInputCount": 475.0, + "HloMacCount": 11811160064.0, + "HloOutputCount": 73.0, + "IfmapSize": 8266545152.0, + "OfmapSize": 75497472.0, + "OutputsReadFromCount": 0.0, + "PassthroughTensorsCount": 0.0, + "RedundantOutputCount": 0.0, + "Traffic": 685798208.0 + } + }, + "sg0000": { + "compiletime": { + "AGOrderingAnalysisPass": 0.07801461219787598, + "AffinePredicateResolution": 0.0017647743225097656, + "AliasDependencyElimination": 0.0001277923583984375, + "AliasDependencyInduction": 0.00855708122253418, + "AliasDependencyReset": 0.08457040786743164, + "BFComputeCutting": 0.003294229507446289, + "BirCodeGenLoop": 0.05274701118469238, + "CCOpFusion": 0.030017614364624023, + "CanonicalizeDAGForPGTiling": 0.003341197967529297, + "CanonicalizeIR": 0.0022792816162109375, + "CoalesceCCOp": 0.0053555965423583984, + "CommuteConcat": 0.0023560523986816406, + "DMALocalityOpt": 0.0013885498046875, + "DMAProfiler": 0.00625157356262207, + "DMATilingProfiler": 0.003763914108276367, + "DataLocalityOpt": 0.09786868095397949, + "DataStreaming": 0.004992246627807617, + "DeConcat": 0.002264261245727539, + "DeadCodeElimination": 0.002042531967163086, + "DeadStoreElimination": 0.030755043029785156, + "DelinearIndices": 0.009100914001464844, + "Delinearization": 0.004424571990966797, + "DoNothing": 6.914138793945313e-05, + "DramToDramTranspose": 0.03130936622619629, + "DumpGraphAndMetadata": 0.005283832550048828, + "EliminateDivs": 0.0042150020599365234, + "ExpandBatchNorm": 0.0019366741180419922, + "ExpandISAMacro": 0.002724170684814453, + "FactorizeBlkDims": 0.011873722076416016, + "FactorizeThreadAxesInFreeDims": 0.002283811569213867, + "FlattenMacroLoop": 0.0031974315643310547, + "GenericAccessSimplifier": 0.002216339111328125, + "InferInitValue": 0.030458927154541016, + "InferIntrinsicOnCC": 0.011402368545532227, + "InferNeuronTensor": 0.04513859748840332, + "InferNonlocalTensors": 0.10613727569580078, + "InferPSumTensor": 0.037427663803100586, + "InlineNativeKernels": 0.00368499755859375, + "InsertIOTransposes": 0.012629508972167969, + "InsertLocalTransposes": 0.007400989532470703, + "InsertOffloadedTransposes": 0.0025758743286132813, + "LICM": 0.0031554698944091797, + "LateLegalizeInst": 0.005858182907104492, + "LateLegalizePostSplit": 0.0029172897338867188, + "LateLowerReshapeOp": 0.0018696784973144531, + "LateLowerTensorOp": 0.004997968673706055, + "LateNeuronInstComb": 0.019808530807495117, + "LayoutPreprocessing": 0.04119300842285156, + "LayoutPreprocessingAndAnalysis": 0.10642147064208984, + "LayoutRequirementAnalysis": 0.0070705413818359375, + "LegalizeCCOpLayout": 0.004191398620605469, + "LegalizeOpLevelAlias": 0.0015521049499511719, + "LegalizePartitionReduce": 0.002257108688354492, + "LegalizeSundaAccess": 0.03900027275085449, + "LegalizeSundaMacro": 0.010483741760253906, + "LegalizeType": 0.0038602352142333984, + "LocalLayoutOpt": 0.01764845848083496, + "LoopFusion": 0.006066322326660156, + "LoopSplitting": 0.0015685558319091797, + "LowerBroadcast": 0.0020384788513183594, + "LowerCCOpBlockAxis": 0.005359172821044922, + "LowerComplexBroadcast": 0.0019440650939941406, + "LowerIntrinsics": 0.030491113662719727, + "LowerTensorOp": 0.012917041778564453, + "LowerTranspose": 0.010635852813720703, + "MacroGeneration": 0.06435012817382813, + "MaskPropagation": 0.0051097869873046875, + "MemcpyElimination": 0.11022067070007324, + "MutateDataType": 0.0014224052429199219, + "NeuronAliasDependencyInduction": 0.00023031234741210938, + "NeuronAliasDependencyReset": 0.021604061126708984, + "NeuronInstComb": 0.013072729110717773, + "NeuronLICM": 0.01006174087524414, + "NeuronLoopFusion": 0.017573833465576172, + "NeuronLoopInterchange": 0.0020608901977539063, + "NeuronSimplifier": 0.010074615478515625, + "NeuronSimplifyPredicates": 0.0060672760009765625, + "NeuronValueNumbering": 0.0041046142578125, + "OptimizeAliasedCopyChain": 0.0014190673828125, + "OptimizeNKIKernels": 0.0021109580993652344, + "PAGLayoutOpt": 0.3779466152191162, + "PComputeCutting": 0.008729696273803711, + "PGLayoutTilingPipeline": 1.5334703922271729, + "PGTiling": 0.47260475158691406, + "PadElimination": 0.0015625953674316406, + "ParAxesAnnotation": 0.2937772274017334, + "PartialLoopFusion": 0.016366004943847656, + "PartialSimdFusion": 0.01980447769165039, + "PerfectLoopNest": 0.0021877288818359375, + "RecognizeOpIdiom": 0.004831075668334961, + "Recompute": 0.00025010108947753906, + "RelaxPredicates": 0.0039484500885009766, + "Rematerialization": 0.004274129867553711, + "ReshapeWeights": 0.000804901123046875, + "ResolveAccessConflict": 0.0038733482360839844, + "ResolveComplicatePredicates": 0.0016858577728271484, + "RewriteReplicationMatmul": 0.0014014244079589844, + "RewriteWeights": 0.00405120849609375, + "SFKVectorizer": 0.20196890830993652, + "SimpleAllReduceTiling": 0.002203702926635742, + "Simplifier": 0.004297018051147461, + "SimplifyMacroPredicates": 0.01361393928527832, + "SimplifyNeuronTensor": 0.009984970092773438, + "SimplifySlice": 0.0010356903076171875, + "SimplifyTensor": 0.006205558776855469, + "SpillPSum": 0.016466140747070313, + "SplitAPUnionSets": 0.029446840286254883, + "SplitAccGrp": 0.0020453929901123047, + "StaticProfiler": 0.004591464996337891, + "StaticTransposeLocalTensor": 0.005173683166503906, + "SundaISel": 0.04554462432861328, + "TCTransform": 0.002426624298095703, + "TensorInitialization": 0.009510517120361328, + "TensorOpSimplifier": 0.0067560672760009766, + "TensorOpTransform": 0.028885841369628906, + "TileCCOps": 0.005466938018798828, + "TilingProfiler": 0.013426065444946289, + "TransformConvOp": 0.002458810806274414, + "TritiumFusion": 0.0620732307434082, + "ValueNumbering": 0.002520322799682617, + "VectorizeDMA": 0.005783796310424805, + "VectorizeMatMult": 0.005175352096557617, + "WeightCoalescing": 0.0029850006103515625, + "ZeroSizeTensorElimination": 0.00011801719665527344 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 2597.0, + "StaticProfiler::AifUb": 40.028141021728516, + "StaticProfiler::ArithmeticIntensityTensorizer": 420.0349426269531, + "StaticProfiler::AverageDmaLength": 1921.007568359375, + "StaticProfiler::AverageFractalPeUtilization": 99.95317840576172, + "StaticProfiler::AveragePartitionUtilization": 99.87249755859375, + "StaticProfiler::AveragePeUtilization": 99.80845642089844, + "StaticProfiler::DDRTransferBytes": 64558336.0, + "StaticProfiler::InternalTransferBytes": 52297728.0, + "StaticProfiler::LoadExpanded": 23298.0, + "StaticProfiler::LocalizationEfficiency": 1049.3489990234375, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1358.191162109375, + "StaticProfiler::StoreExpanded": 5505.0, + "StaticProfiler::TotalDMAExpanded": 28803.0, + "StaticProfiler::TotalDynamicInstancesCount": 3692.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 3689.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 48.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 1412.0, + "TilingProfiler::NumPfTransposes": 7.0, + "TilingProfiler::NumPfTransposesForIo": 1.0, + "TilingProfiler::NumPfTransposesForLocal": 5.0, + "TilingProfiler::NumPfTransposesForNonlocal": 1.0, + "TilingProfiler::PfTransposeInstructions": 608.0, + "TilingProfiler::PfTransposeInstructionsForIo": 128.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 416.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 64.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 0.0, + "TilingProfiler::SimdInstructionsAfterTiling": 257.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg0001": { + "compiletime": { + "AGOrderingAnalysisPass": 0.03313565254211426, + "AffinePredicateResolution": 0.0015239715576171875, + "AliasDependencyElimination": 0.00011467933654785156, + "AliasDependencyInduction": 0.009088993072509766, + "AliasDependencyReset": 1.062025547027588, + "BFComputeCutting": 0.0024559497833251953, + "BirCodeGenLoop": 0.03748297691345215, + "CCOpFusion": 0.04092240333557129, + "CanonicalizeDAGForPGTiling": 0.004329681396484375, + "CanonicalizeIR": 0.002464771270751953, + "CoalesceCCOp": 0.004778146743774414, + "CommuteConcat": 0.0011680126190185547, + "DMALocalityOpt": 0.0016834735870361328, + "DMAProfiler": 0.0039997100830078125, + "DMATilingProfiler": 0.004555702209472656, + "DataLocalityOpt": 0.13762187957763672, + "DataStreaming": 0.0044286251068115234, + "DeConcat": 0.0015981197357177734, + "DeadCodeElimination": 0.0020780563354492188, + "DeadStoreElimination": 0.03435230255126953, + "DelinearIndices": 0.00969839096069336, + "Delinearization": 0.0038826465606689453, + "DoNothing": 9.846687316894531e-05, + "DramToDramTranspose": 0.03438973426818848, + "DumpGraphAndMetadata": 0.00426793098449707, + "EliminateDivs": 0.004217386245727539, + "ExpandBatchNorm": 0.0019202232360839844, + "ExpandISAMacro": 0.0024042129516601563, + "FactorizeBlkDims": 0.01425933837890625, + "FactorizeThreadAxesInFreeDims": 0.0026972293853759766, + "FlattenMacroLoop": 0.002768993377685547, + "GenericAccessSimplifier": 0.001058816909790039, + "InferInitValue": 0.03559255599975586, + "InferIntrinsicOnCC": 0.009636163711547852, + "InferNeuronTensor": 0.04922318458557129, + "InferNonlocalTensors": 0.030732393264770508, + "InferPSumTensor": 0.03249359130859375, + "InlineNativeKernels": 0.0014734268188476563, + "InsertIOTransposes": 0.021765470504760742, + "InsertLocalTransposes": 0.006593465805053711, + "InsertOffloadedTransposes": 0.0034906864166259766, + "LICM": 0.003262758255004883, + "LateLegalizeInst": 0.00400543212890625, + "LateLegalizePostSplit": 0.00289154052734375, + "LateLowerReshapeOp": 0.002287149429321289, + "LateLowerTensorOp": 0.0046651363372802734, + "LateNeuronInstComb": 0.019269704818725586, + "LayoutPreprocessing": 0.03711414337158203, + "LayoutPreprocessingAndAnalysis": 0.2516040802001953, + "LayoutRequirementAnalysis": 0.007753133773803711, + "LegalizeCCOpLayout": 0.003732919692993164, + "LegalizeOpLevelAlias": 0.0016019344329833984, + "LegalizePartitionReduce": 0.0020945072174072266, + "LegalizeSundaAccess": 0.016069650650024414, + "LegalizeSundaMacro": 0.010806083679199219, + "LegalizeType": 0.004706859588623047, + "LocalLayoutOpt": 0.02442765235900879, + "LoopFusion": 0.0067822933197021484, + "LoopSplitting": 0.00033974647521972656, + "LowerBroadcast": 0.0019419193267822266, + "LowerCCOpBlockAxis": 0.005570650100708008, + "LowerComplexBroadcast": 0.0020999908447265625, + "LowerIntrinsics": 0.03607368469238281, + "LowerTensorOp": 0.011876583099365234, + "LowerTranspose": 0.011530637741088867, + "MacroGeneration": 0.10653066635131836, + "MaskPropagation": 0.003092050552368164, + "MemcpyElimination": 0.10495471954345703, + "MutateDataType": 0.0014193058013916016, + "NeuronAliasDependencyInduction": 0.0002295970916748047, + "NeuronAliasDependencyReset": 0.021070480346679688, + "NeuronInstComb": 0.012903451919555664, + "NeuronLICM": 0.00844264030456543, + "NeuronLoopFusion": 0.020880460739135742, + "NeuronLoopInterchange": 0.0021686553955078125, + "NeuronSimplifier": 0.011090755462646484, + "NeuronSimplifyPredicates": 0.0016274452209472656, + "NeuronValueNumbering": 0.004062652587890625, + "OptimizeAliasedCopyChain": 0.0014641284942626953, + "OptimizeNKIKernels": 0.0023856163024902344, + "PAGLayoutOpt": 0.17638587951660156, + "PComputeCutting": 0.00709986686706543, + "PGLayoutTilingPipeline": 1.142796516418457, + "PGTiling": 0.39766955375671387, + "PadElimination": 0.0015380382537841797, + "ParAxesAnnotation": 0.09186458587646484, + "PartialLoopFusion": 0.015995025634765625, + "PartialSimdFusion": 0.026766300201416016, + "PerfectLoopNest": 0.002192258834838867, + "RecognizeOpIdiom": 0.004943370819091797, + "Recompute": 0.00025773048400878906, + "RelaxPredicates": 0.003591299057006836, + "Rematerialization": 0.0025196075439453125, + "ReshapeWeights": 0.0007069110870361328, + "ResolveAccessConflict": 0.00481104850769043, + "ResolveComplicatePredicates": 0.002285003662109375, + "RewriteReplicationMatmul": 0.0021715164184570313, + "RewriteWeights": 0.003401041030883789, + "SFKVectorizer": 0.14661574363708496, + "SimpleAllReduceTiling": 0.0016207695007324219, + "Simplifier": 0.00443577766418457, + "SimplifyMacroPredicates": 0.006165742874145508, + "SimplifyNeuronTensor": 0.006829500198364258, + "SimplifySlice": 0.0013000965118408203, + "SimplifyTensor": 0.0061337947845458984, + "SpillPSum": 0.018761634826660156, + "SplitAPUnionSets": 0.017923593521118164, + "SplitAccGrp": 0.002531290054321289, + "StaticProfiler": 0.003990888595581055, + "StaticTransposeLocalTensor": 0.004915952682495117, + "SundaISel": 0.04209589958190918, + "TCTransform": 0.0012347698211669922, + "TensorInitialization": 0.002599954605102539, + "TensorOpSimplifier": 0.006845712661743164, + "TensorOpTransform": 0.03345227241516113, + "TileCCOps": 0.005617856979370117, + "TilingProfiler": 0.015013933181762695, + "TransformConvOp": 0.002393960952758789, + "TritiumFusion": 0.09340715408325195, + "ValueNumbering": 0.0031540393829345703, + "VectorizeDMA": 0.0015842914581298828, + "VectorizeMatMult": 0.0071103572845458984, + "WeightCoalescing": 0.0026235580444335938, + "ZeroSizeTensorElimination": 0.0001163482666015625 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 7847.0, + "StaticProfiler::AifUb": 490.6532287597656, + "StaticProfiler::ArithmeticIntensityTensorizer": 487.63507080078125, + "StaticProfiler::AverageDmaLength": 869.1515502929688, + "StaticProfiler::AverageFractalPeUtilization": 100.0, + "StaticProfiler::AveragePartitionUtilization": 99.83790588378906, + "StaticProfiler::AveragePeUtilization": 100.0, + "StaticProfiler::DDRTransferBytes": 215827456.0, + "StaticProfiler::InternalTransferBytes": 43515904.0, + "StaticProfiler::LoadExpanded": 238976.0, + "StaticProfiler::LocalizationEfficiency": 99.38487243652344, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 107.76165771484375, + "StaticProfiler::StoreExpanded": 5121.0, + "StaticProfiler::TotalDMAExpanded": 244097.0, + "StaticProfiler::TotalDynamicInstancesCount": 9872.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 9872.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 32.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 6016.0, + "TilingProfiler::NumPfTransposes": 8.0, + "TilingProfiler::NumPfTransposesForIo": 3.0, + "TilingProfiler::NumPfTransposesForLocal": 3.0, + "TilingProfiler::NumPfTransposesForNonlocal": 2.0, + "TilingProfiler::PfTransposeInstructions": 680.0, + "TilingProfiler::PfTransposeInstructionsForIo": 136.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 288.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 256.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 0.0, + "TilingProfiler::SimdInstructionsAfterTiling": 288.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg0002": { + "compiletime": { + "AGOrderingAnalysisPass": 0.018257856369018555, + "AffinePredicateResolution": 0.0011677742004394531, + "AliasDependencyElimination": 0.0001201629638671875, + "AliasDependencyInduction": 0.0052988529205322266, + "AliasDependencyReset": 0.029210567474365234, + "BFComputeCutting": 0.0032625198364257813, + "BirCodeGenLoop": 0.4527714252471924, + "CCOpFusion": 0.02410125732421875, + "CanonicalizeDAGForPGTiling": 0.004324913024902344, + "CanonicalizeIR": 0.0019502639770507813, + "CoalesceCCOp": 0.014463186264038086, + "CommuteConcat": 0.0008339881896972656, + "DMALocalityOpt": 0.005598783493041992, + "DMAProfiler": 0.01209115982055664, + "DMATilingProfiler": 0.004332065582275391, + "DataLocalityOpt": 0.07260942459106445, + "DataStreaming": 0.03940248489379883, + "DeConcat": 0.0005326271057128906, + "DeadCodeElimination": 0.0009255409240722656, + "DeadStoreElimination": 0.0055675506591796875, + "DelinearIndices": 0.004735231399536133, + "Delinearization": 0.0030374526977539063, + "DoNothing": 7.033348083496094e-05, + "DramToDramTranspose": 0.018135547637939453, + "DumpGraphAndMetadata": 0.09476375579833984, + "EliminateDivs": 0.002595663070678711, + "ExpandBatchNorm": 0.002063274383544922, + "ExpandISAMacro": 0.011472225189208984, + "FactorizeBlkDims": 0.008858203887939453, + "FactorizeThreadAxesInFreeDims": 0.0010046958923339844, + "FlattenMacroLoop": 0.002232074737548828, + "GenericAccessSimplifier": 0.0018167495727539063, + "InferInitValue": 0.024865150451660156, + "InferIntrinsicOnCC": 0.009101152420043945, + "InferNeuronTensor": 0.023293495178222656, + "InferNonlocalTensors": 0.01632833480834961, + "InferPSumTensor": 0.27681708335876465, + "InlineNativeKernels": 0.0081634521484375, + "InsertIOTransposes": 0.019203901290893555, + "InsertLocalTransposes": 0.0042340755462646484, + "InsertOffloadedTransposes": 0.002811431884765625, + "LICM": 0.0029730796813964844, + "LateLegalizeInst": 0.01390385627746582, + "LateLegalizePostSplit": 0.012536048889160156, + "LateLowerReshapeOp": 0.0018641948699951172, + "LateLowerTensorOp": 0.0014081001281738281, + "LateNeuronInstComb": 0.008648872375488281, + "LayoutPreprocessing": 0.02658390998840332, + "LayoutPreprocessingAndAnalysis": 0.10707235336303711, + "LayoutRequirementAnalysis": 0.005135536193847656, + "LegalizeCCOpLayout": 0.002307415008544922, + "LegalizeOpLevelAlias": 0.0012297630310058594, + "LegalizePartitionReduce": 0.0010194778442382813, + "LegalizeSundaAccess": 0.07593941688537598, + "LegalizeSundaMacro": 0.010968446731567383, + "LegalizeType": 0.011834383010864258, + "LocalLayoutOpt": 0.013799905776977539, + "LoopFusion": 0.0052182674407958984, + "LoopSplitting": 0.0003161430358886719, + "LowerBroadcast": 0.0013611316680908203, + "LowerCCOpBlockAxis": 0.0040547847747802734, + "LowerComplexBroadcast": 0.002165079116821289, + "LowerIntrinsics": 0.31132984161376953, + "LowerTensorOp": 0.010558843612670898, + "LowerTranspose": 0.012272357940673828, + "MacroGeneration": 0.029862642288208008, + "MaskPropagation": 0.002757549285888672, + "MemcpyElimination": 0.025969266891479492, + "MutateDataType": 0.002087831497192383, + "NeuronAliasDependencyInduction": 0.00016880035400390625, + "NeuronAliasDependencyReset": 0.020352602005004883, + "NeuronInstComb": 0.004126310348510742, + "NeuronLICM": 0.0351865291595459, + "NeuronLoopFusion": 0.007991313934326172, + "NeuronLoopInterchange": 0.002409219741821289, + "NeuronSimplifier": 0.007069587707519531, + "NeuronSimplifyPredicates": 0.12138772010803223, + "NeuronValueNumbering": 0.0028395652770996094, + "OptimizeAliasedCopyChain": 0.0005936622619628906, + "OptimizeNKIKernels": 0.5374257564544678, + "PAGLayoutOpt": 0.08115577697753906, + "PComputeCutting": 0.004801273345947266, + "PGLayoutTilingPipeline": 0.5454635620117188, + "PGTiling": 0.14933419227600098, + "PadElimination": 0.00034046173095703125, + "ParAxesAnnotation": 0.053552865982055664, + "PartialLoopFusion": 0.0067539215087890625, + "PartialSimdFusion": 0.00693058967590332, + "PerfectLoopNest": 0.0035321712493896484, + "RecognizeOpIdiom": 0.003947257995605469, + "Recompute": 0.00024962425231933594, + "RelaxPredicates": 0.013285398483276367, + "Rematerialization": 0.002062082290649414, + "ReshapeWeights": 0.002131223678588867, + "ResolveAccessConflict": 0.0038597583770751953, + "ResolveComplicatePredicates": 0.002032756805419922, + "RewriteReplicationMatmul": 0.001924753189086914, + "RewriteWeights": 0.002452373504638672, + "SFKVectorizer": 0.2690722942352295, + "SimpleAllReduceTiling": 0.008755922317504883, + "Simplifier": 0.004038810729980469, + "SimplifyMacroPredicates": 0.010622739791870117, + "SimplifyNeuronTensor": 1.059011697769165, + "SimplifySlice": 0.0009577274322509766, + "SimplifyTensor": 0.005341768264770508, + "SpillPSum": 0.011537313461303711, + "SplitAPUnionSets": 0.10771751403808594, + "SplitAccGrp": 0.002201557159423828, + "StaticProfiler": 0.012447118759155273, + "StaticTransposeLocalTensor": 0.0038712024688720703, + "SundaISel": 0.04214668273925781, + "TCTransform": 0.0008432865142822266, + "TensorInitialization": 0.012825727462768555, + "TensorOpSimplifier": 0.004651308059692383, + "TensorOpTransform": 0.019537687301635742, + "TileCCOps": 0.006766319274902344, + "TilingProfiler": 0.006911277770996094, + "TransformConvOp": 0.0030303001403808594, + "TritiumFusion": 0.04502224922180176, + "ValueNumbering": 0.001996755599975586, + "VectorizeDMA": 0.0019402503967285156, + "VectorizeMatMult": 0.0027413368225097656, + "WeightCoalescing": 0.00829005241394043, + "ZeroSizeTensorElimination": 0.00013709068298339844 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 44382.0, + "StaticProfiler::AifUb": 205.154296875, + "StaticProfiler::ArithmeticIntensityTensorizer": 201.6046905517578, + "StaticProfiler::AverageDmaLength": 1901.806396484375, + "StaticProfiler::AverageFractalPeUtilization": 99.66542053222656, + "StaticProfiler::AveragePartitionUtilization": 97.7269515991211, + "StaticProfiler::AveragePeUtilization": 98.64861297607422, + "StaticProfiler::DDRTransferBytes": 795531072.0, + "StaticProfiler::InternalTransferBytes": 646388224.0, + "StaticProfiler::LoadExpanded": 376342.0, + "StaticProfiler::LocalizationEfficiency": 98.26979064941406, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 101.01405334472656, + "StaticProfiler::StoreExpanded": 4189.0, + "StaticProfiler::TotalDMAExpanded": 380531.0, + "StaticProfiler::TotalDynamicInstancesCount": 53882.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 53436.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 4.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 23616.0, + "TilingProfiler::NumPfTransposes": 5.0, + "TilingProfiler::NumPfTransposesForIo": 1.0, + "TilingProfiler::NumPfTransposesForLocal": 1.0, + "TilingProfiler::NumPfTransposesForNonlocal": 3.0, + "TilingProfiler::PfTransposeInstructions": 19393.0, + "TilingProfiler::PfTransposeInstructionsForIo": 19008.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 1.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 384.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 4.0, + "TilingProfiler::SimdInstructionsAfterTiling": 158.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg01": { + "compiletime": { + "CanonicalizeConv": 1.2000000424450263e-05, + "CanonicalizeForTensorizer": 1.2999999853491317e-05, + "Canonicalizer": 0.0002500000118743628, + "HoistCompute": 3.000000106112566e-06, + "IdentifyCrossPassTensors": 2.300000051036477e-05, + "MemcastMotion": 1.1000000085914508e-05, + "PenguinizeFunctions": 1.4000000192027073e-05, + "PruneFunctions": 3.099999958067201e-05, + "RemoveOptimizationBarriers": 2.2000000171829015e-05, + "ScatterMotion": 2.9999999242136255e-05, + "TensorizerLegalizationPass": 1.700000029813964e-05, + "VerifySupportedOps": 9.000000318337698e-06, + "algsimp": 6.299999949987978e-05, + "batchnorm_expander": 1.2999999853491317e-05, + "boundary-marker-removal": 4.999999873689376e-06, + "call-inliner": 9.000000318337698e-06, + "canonicalize-boundary-marker": 6.000000212225132e-06, + "collective-stream-id-checker": 3.000000106112566e-06, + "comparison-expander": 4.999999873689376e-06, + "computation-deduplicator": 1.8000000636675395e-05, + "conditional-to-select": 4.999999873689376e-06, + "config-lowering": 2.5999999706982635e-05, + "constant_folding": 7.999999979801942e-06, + "cse": 1.2000000424450263e-05, + "dce": 9.999999974752427e-07, + "dynamic-slice-transpose": 3.999999989900971e-06, + "eliminate-redundant-compare": 3.999999989900971e-06, + "emit-offloaded-dropout": 1.2999999853491317e-05, + "flatten-call-graph": 7.999999979801942e-06, + "fuse-send-recv": 2.099999983329326e-05, + "hilo::LegalizeAlias": 4.999999873689376e-06, + "hilo::NeuronInstCombine": 4.5000000682193786e-05, + "hilo::NeuronOpFusion": 1.700000029813964e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 2.099999983329326e-05, + "hilo::ScheduleFusion": 9.999999974752427e-07, + "hilo::SixtyFourHack": 1.2999999853491317e-05, + "hilo::VerifyAliasing": 1.9999999949504854e-06, + "hlo-mac-count": 2.9999999242136255e-05, + "hlo-verifier": 0.00018000000272877514, + "legalize-ccops": 9.999999974752427e-07, + "legalize-compare": 3.999999989900971e-06, + "lower-argminmax-custom-call": 3.999999989900971e-06, + "map-inline": 1.2000000424450263e-05, + "metadata-naming": 1.8000000636675395e-05, + "mlir::detail::OpToOpPassAdaptor": 9.999999747378752e-05, + "mlir::hlo::MhloToPyPenguin": 0.0009420000133104622, + "mlir::mhlo::LowerComplexExtraPass": 7.999999797903001e-05, + "mlir::mhlo::LowerComplexPass": 0.00015799999528098851, + "native-to-custom-softmax": 6.000000212225132e-06, + "native-to-custom-softmax-dx": 1.2999999853491317e-05, + "operand_upcaster": 1.4999999621068127e-05, + "post-par-pipe-begin": 1.9999999949504854e-06, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0005130000063218176, + "replace-minimum-constant": 6.000000212225132e-06, + "reshape-mover": 3.000000106112566e-06, + "simplify-concat": 4.8999998398358e-05, + "simplify-while-loops": 1.9999999949504854e-06, + "transform-variadic-reduce": 9.000000318337698e-06, + "tuple-simplifier": 4.999999873689376e-06, + "unpack-nested-aws-ntwsr": 3.999999989900971e-06, + "unroll-while-loop": 0.0 + }, + "hilo": { + "ArithmeticIntensity": 457.20416259765625, + "HloMacCount": 50465865728.0, + "Traffic": 220758560.0 + } + }, + "sg02": { + "compiletime": { + "CanonicalizeConv": 0.0002589999930933118, + "CanonicalizeForTensorizer": 1.2000000424450263e-05, + "Canonicalizer": 0.0003060000017285347, + "HoistCompute": 1.9999999949504854e-06, + "IdentifyCrossPassTensors": 2.4000000848900527e-05, + "MemcastMotion": 1.2999999853491317e-05, + "PenguinizeFunctions": 9.000000318337698e-06, + "PruneFunctions": 7.999999979801942e-06, + "RemoveOptimizationBarriers": 2.099999983329326e-05, + "ScatterMotion": 1.9999999949504854e-06, + "TensorizerLegalizationPass": 4.999999873689376e-06, + "VerifySupportedOps": 1.1000000085914508e-05, + "algsimp": 5.900000178371556e-05, + "batchnorm_expander": 1.2000000424450263e-05, + "boundary-marker-removal": 3.999999989900971e-06, + "call-inliner": 1.1000000085914508e-05, + "canonicalize-boundary-marker": 4.999999873689376e-06, + "collective-stream-id-checker": 3.000000106112566e-06, + "comparison-expander": 4.999999873689376e-06, + "computation-deduplicator": 4.099999932805076e-05, + "conditional-to-select": 6.000000212225132e-06, + "config-lowering": 2.5999999706982635e-05, + "constant_folding": 9.000000318337698e-06, + "cse": 1.1000000085914508e-05, + "dce": 9.999999974752427e-07, + "dynamic-slice-transpose": 3.999999989900971e-06, + "eliminate-redundant-compare": 3.999999989900971e-06, + "emit-offloaded-dropout": 1.2999999853491317e-05, + "flatten-call-graph": 1.1000000085914508e-05, + "fuse-send-recv": 1.8000000636675395e-05, + "hilo::LegalizeAlias": 1.9999999949504854e-06, + "hilo::NeuronInstCombine": 5.8000001445179805e-05, + "hilo::NeuronOpFusion": 1.9999999949504854e-06, + "hilo::ReplaceTokenTypeWithU8Pass": 9.000000318337698e-06, + "hilo::ScheduleFusion": 3.999999989900971e-06, + "hilo::SixtyFourHack": 3.899999865097925e-05, + "hilo::VerifyAliasing": 9.999999974752427e-07, + "hlo-mac-count": 0.00016799999866634607, + "hlo-verifier": 0.00015900000289548188, + "legalize-ccops": 9.999999974752427e-07, + "legalize-compare": 3.000000106112566e-06, + "lower-argminmax-custom-call": 3.000000106112566e-06, + "map-inline": 1.2000000424450263e-05, + "metadata-naming": 1.700000029813964e-05, + "mlir::detail::OpToOpPassAdaptor": 2.2000000171829015e-05, + "mlir::hlo::MhloToPyPenguin": 0.0008549999911338091, + "mlir::mhlo::LowerComplexExtraPass": 8.800000068731606e-05, + "mlir::mhlo::LowerComplexPass": 0.00015999999595806003, + "native-to-custom-softmax": 4.999999873689376e-06, + "native-to-custom-softmax-dx": 1.9999999494757503e-05, + "operand_upcaster": 1.2000000424450263e-05, + "post-par-pipe-begin": 1.9999999949504854e-06, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0005009999731555581, + "replace-minimum-constant": 9.000000318337698e-06, + "reshape-mover": 3.000000106112566e-06, + "simplify-concat": 4.3000000005122274e-05, + "simplify-while-loops": 1.9999999949504854e-06, + "transform-variadic-reduce": 4.8000001697801054e-05, + "tuple-simplifier": 4.999999873689376e-06, + "unpack-nested-aws-ntwsr": 3.999999989900971e-06, + "unroll-while-loop": 0.0 + }, + "hilo": { + "ArithmeticIntensity": 99.1578140258789, + "HloMacCount": 38965870592.0, + "Traffic": 785936448.0 + } + } +} \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk2/graph.neff b/context_encoding_model/_tp0_bk2/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..5c4104700a702e1ce4f095214871287974a27fc4 --- /dev/null +++ b/context_encoding_model/_tp0_bk2/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7e216fd8f0f2acfef59524e7cdb4ead506b2c17c584ce45dd222cd4dc4e3f4f +size 1987584 diff --git a/context_encoding_model/_tp0_bk2/log-neuron-cc.txt b/context_encoding_model/_tp0_bk2/log-neuron-cc.txt new file mode 100644 index 0000000000000000000000000000000000000000..f06446f043e2cb0bb7cab2f871a9e1bb5c69b032 --- /dev/null +++ b/context_encoding_model/_tp0_bk2/log-neuron-cc.txt @@ -0,0 +1,5210 @@ +2025-08-07T13:53:50Z INFO 47776 [root]: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/bin/neuronx-cc compile --framework=XLA /home/ubuntu/qwen3/context_encoding_model/_tp0_bk2/model.MODULE_00594b8bc68e927f3dbe+1ad60ced.hlo_module.pb --output /home/ubuntu/qwen3/context_encoding_model/_tp0_bk2/model.MODULE_00594b8bc68e927f3dbe+1ad60ced.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma' --lnc=1 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=/home/ubuntu/qwen3/context_encoding_model/_tp0_bk2/log-neuron-cc.txt --verbose=35 +2025-08-07T13:53:51Z INFO 47776 [root]: NeuronX Compiler version 2.20.9961.0+0acef03a Python version 3.10.12 HWM version 2.20.0.9961+0acef03a NumPy version 1.26.4 Running on AMI ami-040348201d80b58ad Running in region usw2-az4 +2025-08-07T13:53:51Z INFO 47910 [root]: XLA detected +2025-08-07T13:53:51Z INFO 47910 [root]: Pipeline: HLOToTensorizer Frontend StaticIOTranspose WalrusDriver BIRLinker Kelper NeffWrapper +2025-08-07T13:53:51Z INFO 47910 [root]: Intermediate files stored in /home/ubuntu/qwen3/context_encoding_model/_tp0_bk2/neuronxcc-_bze9vv6, output in /home/ubuntu/qwen3/context_encoding_model/_tp0_bk2 +2025-08-07T13:53:51Z INFO 47910 [pipeline.Pipeline.0]: Job Pipeline len(in_states) 1 +2025-08-07T13:53:51Z INFO 47910 [pipeline.Pipeline.0]: Processing input #0 +2025-08-07T13:53:51Z INFO 47910 [pipeline.Pipeline.0]: Running pipeline Pipeline.0 +2025-08-07T13:53:51Z INFO 47910 [pipeline.Pipeline.0]: Starting job job.HLOToTensorizer.0 +2025-08-07T13:53:51Z INFO 47910 [job.HLOToTensorizer.0]: Job HLOToTensorizer len(in_states) 1 +2025-08-07T13:53:51Z INFO 47910 [job.HLOToTensorizer.0]: Processing input #0 +2025-08-07T13:53:51Z INFO 47910 [job.HLOToTensorizer.0]: IR signature: 970c5138d61d773fc00bacb9090fbc05a05573925b8d91068006c211596d3f78 for model.MODULE_00594b8bc68e927f3dbe+1ad60ced.hlo_module.pb +2025-08-07T13:53:51Z INFO 47910 [job.HLOToTensorizer.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo2penguin --input /home/ubuntu/qwen3/context_encoding_model/_tp0_bk2/model.MODULE_00594b8bc68e927f3dbe+1ad60ced.hlo_module.pb --out-dir ./ --output penguin.py --remat --max-costly-ops=2 --max-live-in-size=5 --max-remat-chain-size=10 --max-mem-multiple=1.8 --min-def-use-distance=500 --remat-policy=transformer --allow-same-pass-remat=true --layers-per-module=1 --partition --emit-tensor-level-dropout-ops --modular-flow-mac-threshold=10 --verify-hlo=true --native-to-custom-softmax --partitioner-opts='--transformer' +2025-08-07T13:53:51Z INFO 47910 [job.HLOToTensorizer.0]: DEBUG: needsModular_PreSplit? Yes. macCnt 1817082363904 threshold 4398046511104 num non-trivial Ops 3871 +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 38 + +Pre-Partition Pre-Opt Histogram: +total HLO instructions: 10617 + reshape 2091 19.69% ################################################################ + broadcast 1731 16.30% #################################################### + convert 1281 12.07% ####################################### + transpose 1268 11.94% ###################################### + constant 815 7.68% ######################## + parameter 475 4.47% ############## + slice 445 4.19% ############# + add 365 3.44% ########### + multiply 327 3.08% ########## + dot 326 3.07% ######### + get-tuple-element 295 2.78% ######### + select 255 2.40% ####### + compare 222 2.09% ###### + call 186 1.75% ##### + concatenate 148 1.39% #### + tuple 73 0.69% ## + scatter 73 0.69% ## + negate 72 0.68% ## + all-reduce 72 0.68% ## + custom-call 38 0.36% # + divide 37 0.35% # + iota 7 0.07% + gather 6 0.06% + all-gather 3 0.03% + reduce 3 0.03% + sine 1 0.01% + cosine 1 0.01% + maximum 1 0.01% + +INFO: IoStatistics: total inputs: 475 +INFO: IoStatistics: total outputs: 73 +INFO: IoStatistics: total passthrough tensors: 0 +INFO: IoStatistics: total outputs read from: 0 +INFO: IoStatistics: total redundant outputs: 0 +INFO: IoStatistics: total ifmap size (KiB): 8072798 +INFO: IoStatistics: total ofmap size (KiB): 73728 +INFO: IoStatistics: total must-alias size (KiB): 73728 +INFO: IoStatistics: total may-alias size (KiB): 0 +INFO: HloMacCount has found 1817082331136 +INFO: Traffic has found 8576018605 +INFO: AIF 423.76 + +Pre-Partition Post-Op Histogram: +total HLO instructions: 6623 + reshape 1424 21.50% ################################################################ + convert 992 14.98% ############################################ + transpose 941 14.21% ########################################## + constant 523 7.90% ####################### + parameter 475 7.17% ##################### + broadcast 410 6.19% ################## + dot 325 4.91% ############## + custom-call 223 3.37% ########## + multiply 219 3.31% ######### + add 219 3.31% ######### + get-tuple-element 151 2.28% ###### + slice 147 2.22% ###### + concatenate 146 2.20% ###### + select 110 1.66% #### + compare 76 1.15% ### + scatter 73 1.10% ### + negate 72 1.09% ### + all-reduce 72 1.09% ### + gather 6 0.09% + iota 5 0.08% + all-gather 3 0.05% + reduce 3 0.05% + pad 2 0.03% + sine 1 0.02% + divide 1 0.02% + tuple 1 0.02% + maximum 1 0.02% + rng 1 0.02% + cosine 1 0.02% + +INFO: Found compute bound graph +DEBUG: needsModular_PreSplit? Yes. macCnt 1817082331136 threshold 4398046511104 num non-trivial Ops 2702 +DEBUG: transformer model +INFO: Partitioner configs:ModularFlow BO LBL SA ConcatGraphs: 1 MaxDisj:2 MaxSep:4 LPM:1 +INFO: Markers NOT detected +Potential split-points stats: #CC 75 #AR 72 #AG 3 #BN 0 nClamp 0 +DEBUG: needsModular_SplitFinder? Yes. +ModuleSplitter initial partitioning... #parts 75 +ModuleSplitter initial partitioning... Done. +INFO: Num of unique Module Definitions: 6 +DEBUG: DefMap: 0 1 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 73 74 +New disjoint wave: start 2 len 70 NumReps: 35 macs 1766305300480 +INFO: Attempting to identify and split optimizer at end +First non-zero-mac/used part from the end is 73 +Not enough zero-mac parts. skip +INFO: Optimized 0 all-reduce split instructions +INFO: Number of splitPoints: 37 +ModuleSplitter initial partitioning... #parts 37 +ModuleSplitter initial partitioning... Done. +Remat: gather-iota 0 matches, 0 ops rematted +INFO: Alias legality verification of partitions PASSED. +INFO: No transposable_weight_idx attrs found +INFO: Peak intermediate memory demand is at Partition 1. Num live intermediates at peak is 9 and memory usage is 17301508 bytes. +INFO: Please refer to LiveRangeReport_PostHloPart.txt for detailed intermediate lifetime info. +DEBUG: DefMap: 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 36 +Wrote HLO netlist to hlo_netlist.json +Wrote graph partitions in debug_info_hlo_partitions.json +Processing partition 0 +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 0 +Replaced 0 dropout sequences with OffloadedDropout +INFO: HloMacCount has found 11811160064 +INFO: Traffic has found 685798182 +INFO: AIF 34.45 +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert cosine custom-call dot gather get-tuple-element iota multiply negate parameter reshape scatter select sine slice transpose tuple +Invoking RemoveOptimizationBarriers pass +Processing partition 1 +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 0 +Replaced 0 dropout sequences with OffloadedDropout +INFO: HloMacCount has found 50465865728 +INFO: Traffic has found 220758564 +INFO: AIF 457.20 +HLO Ops used in computation: add all-reduce broadcast compare concatenate constant convert custom-call dot get-tuple-element multiply negate parameter reshape scatter select slice transpose tuple +Invoking RemoveOptimizationBarriers pass +Processing partition 2 +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 0 +Replaced 0 dropout sequences with OffloadedDropout +INFO: HloMacCount has found 38965870592 +INFO: Traffic has found 785936459 +INFO: AIF 99.16 +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert custom-call divide dot gather get-tuple-element iota maximum multiply pad parameter reduce reshape rng scatter select slice transpose tuple +Invoking RemoveOptimizationBarriers pass + +2025-08-07T13:53:51Z INFO 47910 [job.HLOToTensorizer.0]: IR signature: bee1ade3fa0c95d2cdfc26fd8886180bbceebc1c6de39a7400598fd9ad8e3c4e for sg0000/HLOToTensorizer +2025-08-07T13:53:51Z INFO 47910 [job.HLOToTensorizer.0]: IR signature: 3069c52b32b588de7846d6d2f644b8ddadbbc15eca2cda29e7c4a87679dd1cc9 for sg0001/HLOToTensorizer +2025-08-07T13:53:51Z INFO 47910 [job.HLOToTensorizer.0]: IR signature: 9bf15b1eea615e92613371ecfdce9a74d2fb0753562e7669e104a9ec5306c855 for sg0002/HLOToTensorizer +2025-08-07T13:53:51Z INFO 47910 [job.HLOToTensorizer.0]: Job #0 finished +2025-08-07T13:53:51Z INFO 47910 [pipeline.Pipeline.0]: Finished job job.HLOToTensorizer.0 +2025-08-07T13:53:51Z INFO 47910 [pipeline.Pipeline.0]: Starting job job.Frontend.0 +2025-08-07T13:53:51Z INFO 47910 [job.Frontend.0]: Job Frontend len(in_states) 1 +2025-08-07T13:53:51Z INFO 47910 [job.Frontend.0]: Processing input #0 +2025-08-07T13:53:51Z INFO 47910 [job.Frontend.0]: Start model loading +2025-08-07T13:53:51Z INFO 47910 [job.Frontend.0]: Start tensorization +2025-08-07T13:53:52Z INFO 47910 [job.Frontend.0]: Num jobs: 128 +2025-08-07T13:53:52Z USER 47910 [root/Tensorizer/Tensorizer]: Running Tensorizer +2025-08-07T13:53:52Z INFO 47910 [Tensorizer]: Max workers: 3 +2025-08-07T13:53:52Z INFO 49119 [Tensorizer]: Building model from Penguin script "penguin.py.000000"... +2025-08-07T13:53:52Z INFO 49120 [Tensorizer]: Building model from Penguin script "penguin.py.000001"... +2025-08-07T13:53:52Z INFO 49121 [Tensorizer]: Building model from Penguin script "penguin.py.000002"... +2025-08-07T13:53:52Z INFO 49120 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=1 --num-neuroncores-per-sengine=1 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-08-07T13:53:52Z INFO 49119 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=1 --num-neuroncores-per-sengine=1 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49121 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=1 --num-neuroncores-per-sengine=1 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-08-07T13:53:52Z INFO 49121 [sg0002/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:53:52Z INFO 49121 [sg0002/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.001 seconds +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49121 [sg0002/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:53:52Z INFO 49121 [sg0002/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-08-07T13:53:52Z INFO 49121 [sg0002/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.012 seconds +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49121 [sg0002/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.001 seconds +2025-08-07T13:53:52Z INFO 49121 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-08-07T13:53:52Z INFO 49121 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49121 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.001 seconds +2025-08-07T13:53:52Z INFO 49121 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:52Z INFO 49121 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49121 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49121 [sg0002/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-08-07T13:53:52Z INFO 49121 [sg0002/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49121 [sg0002/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.003 seconds +2025-08-07T13:53:52Z INFO 49121 [sg0002/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-08-07T13:53:52Z INFO 49121 [sg0002/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.001 seconds +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.003 seconds +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49121 [sg0002/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.011 seconds +2025-08-07T13:53:52Z INFO 49121 [sg0002/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:53:52Z INFO 49121 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:52Z INFO 49121 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49121 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:52Z INFO 49121 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:52Z INFO 49121 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-08-07T13:53:52Z INFO 49121 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.005 seconds +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49121 [sg0002/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.034 seconds +2025-08-07T13:53:52Z INFO 49121 [sg0002/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-08-07T13:53:52Z INFO 49121 [sg0002/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.006 seconds +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.188 seconds +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.007 seconds +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.013 seconds +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.007 seconds +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.029 seconds +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.007 seconds +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/LegalizeCCOpLayout]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.004 seconds +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.005 seconds +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.004 seconds +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.005 seconds +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.015 seconds +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.018 seconds +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.004 seconds +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49120 [sg0001/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.004 seconds +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:52Z INFO 49119 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-08-07T13:53:53Z INFO 49120 [sg0001/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.033 seconds +2025-08-07T13:53:53Z INFO 49120 [sg0001/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49120 [sg0001/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.029 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49120 [sg0001/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.005 seconds +2025-08-07T13:53:53Z INFO 49120 [sg0001/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:53:53Z INFO 49120 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:53Z INFO 49120 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.005 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.005 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/LegalizeCCOpLayout]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.009 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.085 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.001 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.003 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.001 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.013 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.110 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.003 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/LoopFusion]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.029 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/Rematerialization]: Running Rematerialization +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/Rematerialization]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/Rematerialization]: Rematerialization finished after 0.004 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.020 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/LateLowerTensorOp]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.001 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.005 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.029 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.013 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/Delinearization]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.026 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.008 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/LoopFusion]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.015 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/Rematerialization]: Running Rematerialization +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/Rematerialization]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/Rematerialization]: Rematerialization finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.010 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/Delinearization]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.038 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.004 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.010 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.007 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/DeadStoreElimination]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.018 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.003 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/Delinearization]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.001 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.005 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.004 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/LoopFusion]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/PadElimination]: Running PadElimination +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/PadElimination]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/PadElimination]: PadElimination finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.006 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.005 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.003 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/TCTransform]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.005 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.005 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.008 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.001 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:53Z INFO 49121 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.031 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/Recompute]: Running Recompute +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/Recompute]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:53:53Z INFO 49119 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.006 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49119 [Tensorizer]: After optimization: 26 statements +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/MutateDataType]: Running MutateDataType +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/MutateDataType]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/MutateDataType]: MutateDataType finished after 0.001 seconds +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/TileCCOps]: Running TileCCOps +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=4194304 is not above min_allgather_tile_size_in_bytes=8388608` +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/TileCCOps]: in bfloat16 (4096, 512) %'all_gather.1' = AllGatherOp-46 AllGather_add(bfloat16 (2048, 512) %'transpose.1', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((4096, 512), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.47 | hlo_id: 19 | , id = 46 +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/TileCCOps]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/TileCCOps]: TileCCOps finished after 0.005 seconds +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:54Z INFO 49120 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:54Z INFO 49120 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49120 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.012 seconds +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49120 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.009 seconds +2025-08-07T13:53:54Z INFO 49120 [sg0001/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 1.062 seconds +2025-08-07T13:53:54Z INFO 49120 [sg0001/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/PadElimination]: Running PadElimination +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/PadElimination]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/PadElimination]: PadElimination finished after 0.000 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/LoopFusion]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.005 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49120 [sg0001/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49120 [sg0001/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.105 seconds +2025-08-07T13:53:54Z INFO 49120 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49120 [sg0001/Tensorizer/LoopFusion]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.001 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.006 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/Recompute]: Running Recompute +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/Recompute]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.001 seconds +2025-08-07T13:53:54Z INFO 49121 [Tensorizer]: After optimization: 38 statements +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/MutateDataType]: Running MutateDataType +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/MutateDataType]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/MutateDataType]: MutateDataType finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/TileCCOps]: Running TileCCOps +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `All gather output tensor check failed` +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/TileCCOps]: in float32 (512,) %'all_gather.2' = AllGatherOp-149 AllGather_add(float32 (256,) %'add.11', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.8843 | hlo_id: 101 | , id = 149 +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=2048 is not above min_allgather_tile_size_in_bytes=8388608` +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/TileCCOps]: in uint32 (512,) %'all_gather.3' = AllGatherOp-165 AllGather_add(uint32 (256,) %'add.12', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.8978 | hlo_id: 110 | , id = 165 +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/TileCCOps]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.008 seconds +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/InferIntrinsicOnCC]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.011 seconds +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/TileCCOps]: TileCCOps finished after 0.007 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.007 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.005 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.001 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/InferIntrinsicOnCC]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.009 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.018 seconds +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.014 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.009 seconds +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.005 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.041 seconds +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.007 seconds +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.106 seconds +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.027 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.005 seconds +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.107 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49120 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.029 seconds +2025-08-07T13:53:54Z INFO 49120 [sg0001/Tensorizer/Rematerialization]: Running Rematerialization +2025-08-07T13:53:54Z INFO 49120 [sg0001/Tensorizer/Rematerialization]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.016 seconds +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-08-07T13:53:54Z INFO 49120 [sg0001/Tensorizer/Rematerialization]: Rematerialization finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 49120 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:54Z INFO 49120 [sg0001/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49120 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.009 seconds +2025-08-07T13:53:54Z INFO 49120 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:54Z INFO 49120 [sg0001/Tensorizer/Delinearization]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.106 seconds +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-08-07T13:53:54Z INFO 49119 [sg0000/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-08-07T13:53:54Z INFO 49121 [sg0002/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.054 seconds +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.081 seconds +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.003 seconds +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/PGTiling]: Running PGTiling +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 585 of IO tensor {'CrossPassTensor': ''}bfloat16 %input471|NC|(128, 32) is not sorted, index list (w/ AG ids): [(16, 'AG49'), (13, 'AG50')] +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 586 of IO tensor {'CrossPassTensor': ''}bfloat16 %input472|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(16, 'AG49'), (13, 'AG50')] +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 587 of IO tensor {'CrossPassTensor': ''}bfloat16 %input470|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(16, 'AG49'), (13, 'AG50')] +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 588 of IO tensor {'CrossPassTensor': ''}bfloat16 %input469(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(10, 'AG54'), (15, 'AG52'), (11, 'AG53')] +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 589 of IO tensor {'CrossPassTensor': ''}bfloat16 %input474|NC|(128, 32) is not sorted, index list (w/ AG ids): [(16, 'AG49'), (13, 'AG50')] +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 540 of IO tensor {'CrossPassTensor': ''}bfloat16 %input473|NC|(75968, 32, 128) is not sorted, index list (w/ AG ids): [(14, 'AG59'), (13, 'AG50')] +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.012 seconds +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.018 seconds +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.005 seconds +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.041 seconds +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.003 seconds +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.000 seconds +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.030 seconds +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/PGTiling]: PGTiling finished after 0.149 seconds +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/LoopFusion]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.019 seconds +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.003 seconds +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.294 seconds +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.018 seconds +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 0.545 seconds +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/TilingBottleneck]: 19008: transpose_128x128 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/TilingBottleneck]: 19008: matmul_128x128x1 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x512 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x512 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x512 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/TilingBottleneck]: 128: transpose_128x128 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/TilingBottleneck]: 128: transpose_128x128 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/TilingBottleneck]: 128: transpose_128x128 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/TilingBottleneck]: 48: simd128x512 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/TilingBottleneck]: 32: rmsnorm128x512x128 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/TilingBottleneck]: 32: simd128x512 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/TilingBottleneck]: 32: rmsnorm128x512x128 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/TilingBottleneck]: 2: indirect_load128x1 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/TilingBottleneck]: 1: simd1x1 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/TilingBottleneck]: 1: simd1x1 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/TilingBottleneck]: 1: simd1x1 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/TilingBottleneck]: 1: reduce512x1x1 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/TilingBottleneck]: 1: simd1x512 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/TilingBottleneck]: 1: reduce512x1x1 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/TilingBottleneck]: 1: indirect_load32x128 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.007 seconds +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.008 seconds +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.007 seconds +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.378 seconds +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.023 seconds +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.001 seconds +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.005 seconds +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/PadElimination]: Running PadElimination +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/PadElimination]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/PadElimination]: PadElimination finished after 0.002 seconds +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.005 seconds +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.003 seconds +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/LoopFusion]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.005 seconds +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/PGTiling]: Running PGTiling +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.007 seconds +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.003 seconds +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 702 of IO tensor {'CrossPassTensor': ''}bfloat16 %input79|NC|(128, 32) is not sorted, index list (w/ AG ids): [(16, 'AG82'), (11, 'AG83')] +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 703 of IO tensor {'CrossPassTensor': ''}bfloat16 %input83(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(16, 'AG82'), (11, 'AG83')] +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 704 of IO tensor {'CrossPassTensor': ''}bfloat16 %input81(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(16, 'AG82'), (11, 'AG83')] +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 705 of IO tensor {'CrossPassTensor': ''}bfloat16 %input78|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(16, 'AG82'), (11, 'AG83')] +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 459 of IO tensor {'CrossPassTensor': ''}bfloat16 %input77(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(8, 'AG94'), (6, 'AG90'), (7, 'AG89'), (9, 'AG93'), (10, 'AG92')] +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 569 of IO tensor {'IntermediateTensor': ''}bfloat16 %intermediate1(512, 32, 128) is not sorted, index list (w/ AG ids): [(12, 'AG84'), (11, 'AG83')] +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.078 seconds +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.005 seconds +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.001 seconds +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.009 seconds +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.005 seconds +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.003 seconds +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.005 seconds +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.034 seconds +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/Recompute]: Running Recompute +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/Recompute]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-08-07T13:53:55Z INFO 49120 [Tensorizer]: After optimization: 25 statements +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/MutateDataType]: Running MutateDataType +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/MutateDataType]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/MutateDataType]: MutateDataType finished after 0.001 seconds +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.002 seconds +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.006 seconds +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/TileCCOps]: Running TileCCOps +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/TileCCOps]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/TileCCOps]: TileCCOps finished after 0.006 seconds +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:55Z INFO 49120 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.003 seconds +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.002 seconds +2025-08-07T13:53:55Z INFO 49119 [sg0000/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.073 seconds +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 19008: transpose_128x128 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 19008: matmul_128x128x1 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x512 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x512 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x512 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 594: transpose_128x1 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 384: dma128x512 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 128: transpose_128x128 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 128: transpose_128x128 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 128: transpose_128x128 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 96: dma128x2048 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 96: dma128x2048 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 48: simd128x512 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 32: rmsnorm128x512x128 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 32: simd128x512 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 32: rmsnorm128x512x128 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 16: dma128x1024 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 2: indirect_load128x1 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 1: dma128x32 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 1: simd1x1 +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:55Z INFO 49121 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.006 seconds +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.013 seconds +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.064 seconds +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/PGTiling]: PGTiling finished after 0.473 seconds +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.013 seconds +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.010 seconds +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/InferIntrinsicOnCC]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.010 seconds +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.005 seconds +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.011 seconds +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.008 seconds +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.004 seconds +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.003 seconds +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.024 seconds +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.005 seconds +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.031 seconds +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 1.533 seconds +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/TilingBottleneck]: 512: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/TilingBottleneck]: 512: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/TilingBottleneck]: 256: transpose_128x128 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/TilingBottleneck]: 128: transpose_128x128 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/TilingBottleneck]: 128: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/TilingBottleneck]: 128: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/TilingBottleneck]: 64: transpose_128x128 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/TilingBottleneck]: 64: transpose_128x128 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/TilingBottleneck]: 64: transpose_128x128 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/TilingBottleneck]: 64: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/TilingBottleneck]: 64: softmax512x1x128 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/TilingBottleneck]: 64: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/TilingBottleneck]: 32: simd128x512 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/TilingBottleneck]: 32: rmsnorm128x512x128 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/TilingBottleneck]: 16: indirect_load128x512 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/TilingBottleneck]: 16: rmsnorm128x512x128 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/TilingBottleneck]: 16: simd128x256 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/TilingBottleneck]: 16: simd128x256 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/TilingBottleneck]: 16: simd128x512 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/TilingBottleneck]: 16: transpose_128x128 +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.010 seconds +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.013 seconds +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.005 seconds +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.012 seconds +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.045 seconds +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.011 seconds +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/InferInitValue]: Running InferInitValue +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.008 seconds +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/InferInitValue]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.037 seconds +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/InferInitValue]: InferInitValue finished after 0.025 seconds +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.001 seconds +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.008 seconds +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.011 seconds +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.007 seconds +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/SimplifyTensor]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.008 seconds +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.252 seconds +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.005 seconds +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/SundaISel]: Running SundaISel +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.031 seconds +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/SundaISel]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.098 seconds +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 512: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 512: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 256: transpose_128x128 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: transpose_128x128 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: dma128x512 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: transpose_128x128 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: transpose_128x128 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: transpose_128x128 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: softmax512x1x128 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: dma128x512 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: dma128x512 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: rmsnorm128x512x128 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: dma128x2048 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: dma128x512 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 16: indirect_load128x512 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 16: rmsnorm128x512x128 +2025-08-07T13:53:56Z INFO 49119 [sg0000/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/SundaISel]: SundaISel finished after 0.042 seconds +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.000 seconds +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.020 seconds +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.092 seconds +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.006 seconds +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.008 seconds +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.007 seconds +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.176 seconds +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.003 seconds +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.004 seconds +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.005 seconds +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.006 seconds +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/PGTiling]: Running PGTiling +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.009 seconds +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 669 of IO tensor {'CrossPassTensor': ''}bfloat16 %input86|NC|(128, 32) is not sorted, index list (w/ AG ids): [(15, 'AG85'), (12, 'AG86')] +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 670 of IO tensor {'CrossPassTensor': ''}bfloat16 %input87|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(15, 'AG85'), (12, 'AG86')] +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 671 of IO tensor {'CrossPassTensor': ''}bfloat16 %input85|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(15, 'AG85'), (12, 'AG86')] +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 672 of IO tensor {'CrossPassTensor': ''}bfloat16 %input84(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(7, 'AG90'), (14, 'AG88'), (8, 'AG89')] +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 673 of IO tensor {'CrossPassTensor': ''}bfloat16 %input90|NC|(128, 32) is not sorted, index list (w/ AG ids): [(15, 'AG85'), (12, 'AG86')] +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 674 of IO tensor {'CrossPassTensor': ''}bfloat16 %input94(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(15, 'AG85'), (12, 'AG86')] +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 679 of IO tensor {'CrossPassTensor': ''}bfloat16 %input92(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(15, 'AG85'), (12, 'AG86')] +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 680 of IO tensor {'CrossPassTensor': ''}bfloat16 %input89|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(15, 'AG85'), (12, 'AG86')] +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 447 of IO tensor {'CrossPassTensor': ''}bfloat16 %input88(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(2, 'AG100'), (0, 'AG96'), (1, 'AG95'), (3, 'AG99'), (4, 'AG98')] +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.033 seconds +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.005 seconds +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-08-07T13:53:56Z INFO 49120 [sg0001/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.025 seconds +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronValueNumbering]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.003 seconds +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.004 seconds +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/VectorizeDMA]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49121 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.004 seconds +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.001 seconds +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/DeConcat]: Running DeConcat +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/DeConcat]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/DeConcat]: DeConcat finished after 0.001 seconds +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.001 seconds +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.007 seconds +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.004 seconds +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.007 seconds +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.045 seconds +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.002 seconds +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.000 seconds +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.008 seconds +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.003 seconds +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.007 seconds +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.005 seconds +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.012 seconds +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.001 seconds +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.107 seconds +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/PGTiling]: PGTiling finished after 0.398 seconds +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.011 seconds +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.022 seconds +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.003 seconds +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.009 seconds +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.002 seconds +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/SpillPSum]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.010 seconds +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/SpillPSum]: SpillPSum finished after 0.012 seconds +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.011 seconds +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.002 seconds +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.034 seconds +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 1.143 seconds +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/TilingBottleneck]: 512: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/TilingBottleneck]: 512: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/TilingBottleneck]: 256: transpose_128x128 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/TilingBottleneck]: 128: transpose_128x128 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/TilingBottleneck]: 128: transpose_128x128 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/TilingBottleneck]: 128: transpose_128x128 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/TilingBottleneck]: 128: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/TilingBottleneck]: 128: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/TilingBottleneck]: 64: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/TilingBottleneck]: 64: softmax512x1x128 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/TilingBottleneck]: 64: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/TilingBottleneck]: 48: simd128x512 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/TilingBottleneck]: 32: rmsnorm128x512x128 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/TilingBottleneck]: 32: simd128x512 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/TilingBottleneck]: 32: rmsnorm128x512x128 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/TilingBottleneck]: 16: rmsnorm128x512x128 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/TilingBottleneck]: 16: simd64x512 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.010 seconds +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.015 seconds +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.004 seconds +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.013 seconds +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.001 seconds +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.003 seconds +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.014 seconds +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/InferInitValue]: Running InferInitValue +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.049 seconds +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/InferInitValue]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.008 seconds +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.002 seconds +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/InferInitValue]: InferInitValue finished after 0.030 seconds +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.008 seconds +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.010 seconds +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/SimplifyTensor]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.005 seconds +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.006 seconds +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/SundaISel]: Running SundaISel +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.311 seconds +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.008 seconds +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/LegalizeType]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/SundaISel]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/LegalizeType]: LegalizeType finished after 0.012 seconds +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/SundaISel]: SundaISel finished after 0.046 seconds +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.000 seconds +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.022 seconds +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.002 seconds +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:53:57Z INFO 49119 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.035 seconds +2025-08-07T13:53:57Z INFO 49121 [sg0002/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.138 seconds +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 512: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 512: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 384: dma128x512 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 256: transpose_128x128 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: transpose_128x128 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: transpose_128x128 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: transpose_128x128 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: dma128x512 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 96: dma128x2048 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 96: dma128x2048 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 64: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 64: softmax512x1x128 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 64: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 48: simd128x512 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: rmsnorm128x512x128 +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.005 seconds +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.011 seconds +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.011 seconds +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.012 seconds +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.002 seconds +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.011 seconds +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.003 seconds +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.001 seconds +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.003 seconds +2025-08-07T13:53:57Z INFO 49120 [sg0001/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.006 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/InferInitValue]: Running InferInitValue +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.004 seconds +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/InferInitValue]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.018 seconds +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/InferInitValue]: InferInitValue finished after 0.036 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:58Z INFO 49121 [sg0002/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49121 [sg0002/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.277 seconds +2025-08-07T13:53:58Z INFO 49121 [sg0002/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:53:58Z INFO 49121 [sg0002/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.011 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/SimplifyTensor]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49121 [sg0002/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.008 seconds +2025-08-07T13:53:58Z INFO 49121 [sg0002/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.006 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/SundaISel]: Running SundaISel +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.011 seconds +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/SundaISel]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.012 seconds +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:53:58Z INFO 49121 [sg0002/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49121 [sg0002/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.076 seconds +2025-08-07T13:53:58Z INFO 49121 [sg0002/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/SundaISel]: SundaISel finished after 0.042 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49121 [sg0002/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.000 seconds +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.021 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49121 [sg0002/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.013 seconds +2025-08-07T13:53:58Z INFO 49121 [sg0002/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-08-07T13:53:58Z INFO 49121 [sg0002/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49121 [sg0002/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.013 seconds +2025-08-07T13:53:58Z INFO 49121 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.045 seconds +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/NeuronValueNumbering]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.021 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.004 seconds +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.013 seconds +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/VectorizeDMA]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.007 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:53:58Z INFO 49121 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.006 seconds +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49121 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.121 seconds +2025-08-07T13:53:58Z INFO 49121 [sg0002/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:53:58Z INFO 49121 [sg0002/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.014 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49121 [sg0002/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.011 seconds +2025-08-07T13:53:58Z INFO 49121 [sg0002/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/DeConcat]: Running DeConcat +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/DeConcat]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/DeConcat]: DeConcat finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.056 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronValueNumbering]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.004 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.013 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/VectorizeDMA]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.020 seconds +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/DeConcat]: Running DeConcat +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/DeConcat]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/DeConcat]: DeConcat finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.003 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.062 seconds +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.027 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.013 seconds +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.005 seconds +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.016 seconds +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.007 seconds +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.011 seconds +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.093 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.020 seconds +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.017 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/SpillPSum]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/SpillPSum]: SpillPSum finished after 0.016 seconds +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.007 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49119 [sg0000/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.016 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.005 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.012 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.019 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.003 seconds +2025-08-07T13:53:58Z INFO 49120 [sg0001/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.030 seconds +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/SpillPSum]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/SpillPSum]: SpillPSum finished after 0.019 seconds +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.004 seconds +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/LegalizeType]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/LegalizeType]: LegalizeType finished after 0.004 seconds +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.010 seconds +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.036 seconds +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.001 seconds +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/LegalizeType]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/LegalizeType]: LegalizeType finished after 0.005 seconds +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.037 seconds +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/WeightCoalescing]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.008 seconds +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.003 seconds +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.032 seconds +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/WeightCoalescing]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.003 seconds +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.039 seconds +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.004 seconds +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.016 seconds +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/TensorInitialization]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.010 seconds +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.004 seconds +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.006 seconds +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.003 seconds +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.003 seconds +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.002 seconds +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.007 seconds +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/DataStreaming]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/DataStreaming]: DataStreaming finished after 0.004 seconds +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.010 seconds +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.001 seconds +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/DataStreaming]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/DataStreaming]: DataStreaming finished after 0.005 seconds +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.147 seconds +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.004 seconds +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.005 seconds +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.002 seconds +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 1.041ms (48.000MiB, est bw: 48.348GB/s, 55.242% of tot. time) for bfloat16<128 x 128> TongaSB partitions[5] bfloat16 (2, 2, 2, 2, 24, 128, 512) %'input84_local_910'[i15_0_0_916_0_0_1166,i15_0_0_916_0_1_1166,i15_0_0_1,c1_904,c2_905,i0.128,i1.128+128p_1342] = load bfloat16<128 x 128> {'CrossPassTensor': ''}bfloat16 (8, 4, 2, 128, 24, 128) %'input84'[4i15_0_0_916_0_0_1166+2i15_0_0_916_0_1_1166+i15_0_0_1,p_1342,c1_904,i0.128,c2_905,i1.128] # id=1077, src_id=None, , instances=1536 # dl = tensor_op_name: _dot.6 | hlo_id: 49 | [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 244.771us (48.000MiB, est bw: 205.627GB/s, 12.989% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (2, 24, 2, 128, 2048) %'input85_local_890'[i10_0_0,i10_0_1,c2_885,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 24, 128, 4096) %'input85'[i10_0_0,i10_0_1,i0.128,i1.2048+2048c2_885] # id=1068, src_id=None, , instances=96 # dl = tensor_op_name: _dot.4 | hlo_id: 39 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 244.771us (48.000MiB, est bw: 205.627GB/s, 12.989% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (2, 24, 2, 128, 2048) %'input87_local_900'[i12_0_0,3i12_0_1_0+i12_0_1_1,c2_895,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 24, 128, 4096) %'input87'[i12_0_0,3i12_0_1_0+i12_0_1_1,i0.128,i1.2048+2048c2_895] # id=1071, src_id=None, , instances=96 # dl = tensor_op_name: _dot.5 | hlo_id: 30 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 99.175us (16.000MiB, est bw: 169.167GB/s, 5.263% of tot. time) for bfloat16<128 x 1024> TongaSB partitions[4] bfloat16 (2, 2, 2, 4, 128, 4, 512) %'input88_local_992'[i115_0_0_0_998_0_0_1167,i115_0_0_0_998_0_1_1167,i115_0_0_0_1,c1_985,i0.128,i3.4,i1.128+128i2.2+256p_1362] = load bfloat16<128 x 1024> {'CrossPassTensor': ''}bfloat16 (8, 2, 128, 4, 4, 2, 128) %'input88'[4i115_0_0_0_998_0_0_1167+2i115_0_0_0_998_0_1_1167+i115_0_0_0_1,p_1362,i0.128,c1_985,i3.4,i2.2,i1.128] # id=1146, src_id=None, , instances=64 # dl = tensor_op_name: _dot.10 | hlo_id: 165 | [[i0.128];[i1.128, i2.2, i3.4]] -> [[i0.128];[i1.128, i2.2, i3.4]] +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 82.457us (16.000MiB, est bw: 203.466GB/s, 4.376% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (4, 4, 2, 128, 2048) %'input94_local_931'[i41_0,i41_1,c2_926,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (4, 4, 128, 4096) %'input94'[i41_0,i41_1,i0.128,i1.2048+2048c2_926] # id=1091, src_id=None, , instances=32 # dl = tensor_op_name: _dot.9 | hlo_id: 67 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 21.589us (4.000MiB, est bw: 194.277GB/s, 1.146% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (4, 2, 128, 2048) %'838.1302'[T_i0_0_1645,T_i1_0_1641_1645,i0.128,i1.2048] = load bfloat16<128 x 2048> non_local bfloat16 (4, 128, 4096) %'add.4'[T_i0_0_1645,i0.128,2048T_i1_0_1641_1645+i1.2048] # id=1168, src_id=None, , instances=8 # dl = tensor_op_name: add.4_pftranspose_838 | hlo_id: 17 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 21.589us (4.000MiB, est bw: 194.277GB/s, 1.146% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (2, 4, 128, 2048) %'842.1307'[T_i17_0_850_0_1646,2T_i0_0_0_1642_1646+T_i0_0_1_1642_1646,i0.128,i1.2048] = load bfloat16<128 x 2048> non_local bfloat16 (2097152,) %'all_reduce.1-buffer-1678'[2048T_i17_0_850_0_1646+4096i0.128+1048576T_i0_0_0_1642_1646+i1.2048+524288T_i0_0_1_1642_1646] # id=1177, src_id=None, , instances=8 # dl = tensor_op_name: all_reduce.1_pftranspose_842 | hlo_id: 52 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 21.589us (4.000MiB, est bw: 194.277GB/s, 1.146% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (4, 2, 128, 2048) %'input92_local_945'[i52_0_0_1643,c1_940_1643,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (4, 128, 4096) %'input92'[i52_0_0_1643,i0.128,2048c1_940_1643+i1.2048] # id=1112, src_id=None, , instances=8 # dl = tensor_op_name: _dot.8 | hlo_id: 102 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 21.589us (4.000MiB, est bw: 194.277GB/s, 1.146% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[1] bfloat16 (2, 128, 16, 512) %'input89_local_981'[c0_976_0,i0.128,i2.16,128p_1351+i1.128] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (4, 128, 32, 128) %'input89'[p_1351,i0.128,16c0_976_0+i2.16,i1.128] # id=1141, src_id=None, , instances=8 # dl = tensor_op_name: _dot.7 | hlo_id: 151 | [[i0.128];[i1.128, i2.16]] -> [[i0.128];[i1.128, i2.16]] +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 15.912us (4.000MiB, est bw: 263.593GB/s, 0.844% of tot. time) for bfloat16<128 x 1024> non_local bfloat16 (2097152,) %'dot.7-buffer-1676'[2048i15_0_0_916_0_0_1166+4096i0.128+1024i15_0_0_916_0_1_1166+i1.1024+524288i16_0_916_1166] = store bfloat16<128 x 1024> TongaSB partitions[3] bfloat16 (2, 2, 4, 128, 1024) %917[i15_0_0_916_0_0_1166,i15_0_0_916_0_1_1166,i16_0_916_1166,i0.128,i1.1024] # id=1080, src_id=None, , instances=16 # dl = tensor_op_name: _dot.6 | hlo_id: 49 | [[i0.128];[i1.1024]] -> [[i0.128];[i1.1024]] +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/SimplifyNeuronTensor]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.004 seconds +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/OptimizeNKIKernels]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 1.059 seconds +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.006 seconds +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.002 seconds +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.202 seconds +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/DataStreaming]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.006 seconds +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/DataStreaming]: DataStreaming finished after 0.039 seconds +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.005 seconds +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.041 seconds +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.004 seconds +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.002 seconds +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 99.175us (16.000MiB, est bw: 169.167GB/s, 25.983% of tot. time) for bfloat16<128 x 1024> TongaSB partitions[4] bfloat16 (2, 2, 2, 4, 128, 4, 512) %'input77_local_1236'[i122_0_0_0_1242_0_0_1468,i122_0_0_0_1242_0_1_1468,i122_0_0_0_1,c1_1229,i0.128,i3.4,i1.128+128i2.2+256p_1748] = load bfloat16<128 x 1024> {'CrossPassTensor': ''}bfloat16 (8, 2, 128, 4, 4, 2, 128) %'input77'[4i122_0_0_0_1242_0_0_1468+2i122_0_0_0_1242_0_1_1468+i122_0_0_0_1,p_1748,i0.128,c1_1229,i3.4,i2.2,i1.128] # id=1452, src_id=None, , instances=64 # dl = tensor_op_name: _dot.3 | hlo_id: 145 | [[i0.128];[i1.128, i2.2, i3.4]] -> [[i0.128];[i1.128, i2.2, i3.4]] +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 82.457us (16.000MiB, est bw: 203.466GB/s, 21.603% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (4, 4, 2, 128, 2048) %'input83_local_1198'[i48_0_1467,i32_0_0_1,c2_1193,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (4, 4, 128, 4096) %'input83'[i48_0_1467,i32_0_0_1,i0.128,i1.2048+2048c2_1193] # id=1348, src_id=None, , instances=32 # dl = tensor_op_name: _dot.2 | hlo_id: 34 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 34.173us (4.000MiB, est bw: 122.737GB/s, 8.953% of tot. time) for bfloat16<128 x 512> TongaSB partitions[2] bfloat16 (2, 16, 128, 512) %'intermediate1_pftranspose_1121'[T_i1_0_1125_0_1997,i1_0_1_1995_1997,i0.128,i1.512] = load bfloat16<128 x 512> non_local bfloat16 (32, 128, 512) %'all_gather.1'[16T_i1_0_1125_0_1997+i1_0_1_1995_1997,i0.128,i1.512] # id=1307, src_id=None, , instances=32 # dl = tensor_op_name: UnnamedModule | hlo_id: 1 | [[i0.128];[i1.512]] -> [[i0.128];[i1.512]] +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 34.173us (4.000MiB, est bw: 122.737GB/s, 8.953% of tot. time) for bfloat16<128 x 512> TongaSB partitions[2] bfloat16 (2, 16, 128, 512) %'custom-call.226.1540'[i29_0_1183_0,i29_0_1183_1,i0.128,i1.512] = load bfloat16<128 x 512> non_local bfloat16 (32, 128, 512) %'all_gather.1'[16i29_0_1183_0+i29_0_1183_1,i0.128,i1.512] # id=1343, src_id=None, , instances=32 # dl = tensor_op_name: _custom-call.226 | hlo_id: 27 | [[i0.128];[i1.512]] -> [[i0.128];[i1.512]] +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 21.589us (4.000MiB, est bw: 194.277GB/s, 5.656% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[1] bfloat16 (2, 128, 16, 512) %'input78_local_1225'[c0_1220_0,i0.128,i2.16,128p_1663+i1.128] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (4, 128, 32, 128) %'input78'[p_1663,i0.128,16c0_1220_0+i2.16,i1.128] # id=1447, src_id=None, , instances=8 # dl = tensor_op_name: _dot | hlo_id: 131 | [[i0.128];[i1.128, i2.16]] -> [[i0.128];[i1.128, i2.16]] +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 21.589us (4.000MiB, est bw: 194.277GB/s, 5.656% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (4, 2, 128, 2048) %'input81_local_1212'[i120_0_1998,c1_1207_1996_1998,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (4, 128, 4096) %'input81'[i120_0_1998,i0.128,2048c1_1207_1996_1998+i1.2048] # id=1393, src_id=None, , instances=8 # dl = tensor_op_name: _dot.1 | hlo_id: 82 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 15.912us (4.000MiB, est bw: 263.593GB/s, 4.169% of tot. time) for bfloat16<128 x 1024> non_local bfloat16 (2097152,) %'dot.4-buffer-2025'[2048i122_0_0_0_1242_0_0_1468+4096i0.128+1024i122_0_0_0_1242_0_1_1468+i1.1024+524288i123_0_1242_1468] = store bfloat16<128 x 1024> TongaSB partitions[3] bfloat16 (2, 2, 4, 128, 1024) %1243[i122_0_0_0_1242_0_0_1468,i122_0_0_0_1242_0_1_1468,i123_0_1242_1468,i0.128,i1.1024] # id=1455, src_id=None, , instances=16 # dl = tensor_op_name: _dot.3 | hlo_id: 145 | [[i0.128];[i1.1024]] -> [[i0.128];[i1.1024]] +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 13.416us (4.000MiB, est bw: 312.630GB/s, 3.515% of tot. time) for bfloat16<128 x 2048> {'IntermediateTensor': ''}bfloat16 (4, 128, 4096) %'intermediate1'(init=0.0)[2T_i0_1125_0_0+T_i0_1125_0_1,i0.128,2048T_i1_0_1125_0+i1.2048] = store bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (2, 4, 128, 2048) %'1121.1754'[T_i1_0_1125_0,2T_i0_1125_0_0+T_i0_1125_0_1,i0.128,i1.2048] # id=1482, src_id=None, , instances=8 # dl = tensor_op_name: intermediate1_pftranspose_1121 | hlo_id: 1 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 11.445us (2.000MiB, est bw: 183.243GB/s, 2.998% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[1] bfloat16 (4, 128, 2048) %'transpose.1_pftranspose_1116'[i13_0,i0.128,i1.2048] = indirect_load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (151936, 2048) %'input76'[i0.128,i1.2048] generic generic_dims:[0] generic_addrs: int32<128 x 1> TongaSB partitions[0] int32 (128, 4, 1) %'input0_local_1155'[i0.128,i13_0,0] # id=1305, src_id=None, , attrs={'mode': OOBMode.ERROR}, instances=4 # dl = tensor_op_name: _gather.41 | hlo_id: 16 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 11.114us (2.000MiB, est bw: 188.702GB/s, 2.912% of tot. time) for bfloat16<128 x 512> non_local bfloat16 (4, 4, 128, 512) %'transpose.1'[2T_i12_0_1120_0+T_i12_0_1120_1,T_i12_1_1120_0,i0.128,i1.512] = store bfloat16<128 x 512> TongaSB partitions[2] bfloat16 (4, 4, 128, 512) %'1116.1752'[2T_i12_0_1120_0+T_i12_0_1120_1,T_i12_1_1120_0,i0.128,i1.512] # id=1469, src_id=None, , instances=16 # dl = tensor_op_name: transpose.1_pftranspose_1116 | hlo_id: 16 | [[i0.128];[i1.512]] -> [[i0.128];[i1.512]] +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.018 seconds +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.006 seconds +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/OptimizeNKIKernels]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.002 seconds +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.003 seconds +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.030 seconds +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.004 seconds +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.005 seconds +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49120 [sg0001/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.037 seconds +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.029 seconds +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.003 seconds +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-08-07T13:53:59Z INFO 49120 [Tensorizer]: BirCodeGen estimate #instances=9982 in sg0001 +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49120 [Tensorizer]: IR signature: b5c435074d5e1c109f799ae0f0b0aaed48cff1e17006399a288964acdab9fbda for nc00/sg0001/TensorizerBIR +2025-08-07T13:53:59Z INFO 49120 [Tensorizer]: Weights total number of bytes: 163840 +2025-08-07T13:53:59Z INFO 49120 [Tensorizer]: Successfully built model. +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.005 seconds +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.269 seconds +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.014 seconds +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49119 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.014 seconds +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.009 seconds +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 3.014ms (594.000MiB, est bw: 206.636GB/s, 61.508% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (594, 2, 128, 2048) %'698.1077'[i31_0,T_i1_0_2748,i0.128,i1.128+128i2.16] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (75968, 32, 128) %'input473'[128i31_0+i0.128,16T_i1_0_2748+i2.16,i1.128] # id=1076, src_id=None, , instances=1188 # dl = tensor_op_name: input473_pftranspose_698 | hlo_id: 90 | if -128i31_0-i0.128+75967 >= 0 [[i0.128];[i1.128, i2.16]] -> [[i0.128];[i1.128, i2.16]] +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 1.041ms (48.000MiB, est bw: 48.348GB/s, 21.243% of tot. time) for bfloat16<128 x 128> TongaSB partitions[5] bfloat16 (2, 2, 2, 2, 24, 128, 512) %'input469_local_771'[i15_0_0_777_0_0_1050,i15_0_0_777_0_1_1050,i15_0_0_1,c1_765,c2_766,i0.128,i1.128+128p_2158] = load bfloat16<128 x 128> {'CrossPassTensor': ''}bfloat16 (8, 4, 2, 128, 24, 128) %'input469'[4i15_0_0_777_0_0_1050+2i15_0_0_777_0_1_1050+i15_0_0_1,p_2158,c1_765,i0.128,c2_766,i1.128] # id=935, src_id=None, , instances=1536 # dl = tensor_op_name: _dot.256 | hlo_id: 59 | [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 244.771us (48.000MiB, est bw: 205.627GB/s, 4.995% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (2, 24, 2, 128, 2048) %'input470_local_751'[i10_0_0,i10_0_1,c2_746,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 24, 128, 4096) %'input470'[i10_0_0,i10_0_1,i0.128,i1.2048+2048c2_746] # id=926, src_id=None, , instances=96 # dl = tensor_op_name: _dot.254 | hlo_id: 49 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 244.771us (48.000MiB, est bw: 205.627GB/s, 4.995% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (2, 24, 2, 128, 2048) %'input472_local_761'[i12_0_0,i12_0_1,c2_756,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 24, 128, 4096) %'input472'[i12_0_0,i12_0_1,i0.128,i1.2048+2048c2_756] # id=929, src_id=None, , instances=96 # dl = tensor_op_name: _dot.255 | hlo_id: 40 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 191.807us (297.000KiB, est bw: 1.586GB/s, 3.914% of tot. time) for float32<1 x 128> {'no_delinear': '0'}non_local float32 (1, 75968) %'convert.59'[0,128i31_0+i0.128] = store float32<1 x 128> TongaSB partitions[1] float32 (594, 1, 128) %'dot.257.1087'[i31_0,0,i0.128] # id=1085, src_id=None, , instances=594 # dl = tensor_op_name: _dot.257 | hlo_id: 90 | if -128i31_0-i0.128+75967 >= 0 [[];[i0.128]] -> [[];[i0.128]] +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 22.647us (296.758KiB, est bw: 13.418GB/s, 0.462% of tot. time) for float32<1 x 15194> TongaSB partitions[1] float32 (5, 1, 15194) %'custom-call.411.1156'[i1,0,i0.15194] = load float32<1 x 15194> {'no_delinear': '0'}non_local float32 (1, 75968) %'convert.59'[15194i1+i0.15194] # id=1151, src_id=None, , instances=5 # dl = tensor_op_name: _custom-call.411 | hlo_id: 93 | if -15194i1-i0.15194+75967 >= 0 [[];[i0.15194]] -> [[];[i0.15194]] +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 21.589us (4.000MiB, est bw: 194.277GB/s, 0.441% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (4, 2, 128, 2048) %'702.2137'[T_i0_0_2749,T_i1_0_2746_2749,i0.128,i1.2048] = load bfloat16<128 x 2048> non_local bfloat16 (4, 128, 4096) %'add.9'[T_i0_0_2749,i0.128,2048T_i1_0_2746_2749+i1.2048] # id=1051, src_id=None, , instances=8 # dl = tensor_op_name: add.9_pftranspose_702 | hlo_id: 27 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 21.589us (4.000MiB, est bw: 194.277GB/s, 0.441% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (4, 2, 128, 2048) %'706.2142'[T_i0_0_2750,T_i1_0_2747_2750,i0.128,i1.2048] = load bfloat16<128 x 2048> non_local bfloat16 (2097152,) %'all_reduce.3-buffer-2764'[524288T_i0_0_2750+4096i0.128+2048T_i1_0_2747_2750+i1.2048] # id=1060, src_id=None, , instances=8 # dl = tensor_op_name: all_reduce.3_pftranspose_706 | hlo_id: 62 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 15.912us (4.000MiB, est bw: 263.593GB/s, 0.325% of tot. time) for bfloat16<128 x 1024> non_local bfloat16 (2097152,) %'dot.14-buffer-2762'[2048i15_0_0_777_0_0_1050+4096i0.128+1024i15_0_0_777_0_1_1050+i1.1024+524288i16_0_777_1050] = store bfloat16<128 x 1024> TongaSB partitions[3] bfloat16 (2, 2, 4, 128, 1024) %778[i15_0_0_777_0_0_1050,i15_0_0_777_0_1_1050,i16_0_777_1050,i0.128,i1.1024] # id=938, src_id=None, , instances=16 # dl = tensor_op_name: _dot.256 | hlo_id: 59 | [[i0.128];[i1.1024]] -> [[i0.128];[i1.1024]] +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 13.416us (4.000MiB, est bw: 312.630GB/s, 0.274% of tot. time) for bfloat16<128 x 2048> non_local bfloat16 (512, 32, 128) %'convert.57'[i0.128+256T_i20_714_0_0+128T_i20_714_0_1,16T_i19_0_714_0_1147+i2.4+4i3.4,i1.128] = store bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (2, 4, 128, 4, 512) %'710.2552'[T_i19_0_714_0_1147,2T_i20_714_0_0+T_i20_714_0_1,i0.128,i3.4,i1.128+128i2.4] # id=1064, src_id=None, , instances=8 # dl = tensor_op_name: convert.57_pftranspose_710 | hlo_id: 70 | [[i0.128];[i1.128, i2.4, i3.4]] -> [[i0.128];[i1.128, i2.4, i3.4]] +2025-08-07T13:53:59Z INFO 49121 [sg0002/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49119 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.053 seconds +2025-08-07T13:54:00Z INFO 49121 [sg0002/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.012 seconds +2025-08-07T13:54:00Z INFO 49121 [sg0002/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49119 [Tensorizer]: BirCodeGen estimate #instances=4059 in sg0000 +2025-08-07T13:54:00Z INFO 49119 [Tensorizer]: IR signature: e39ad22ce8afdef7ecc6fdf4828eaab9581e10801d2260a08a280d875cd1c6ae for nc00/sg0000/TensorizerBIR +2025-08-07T13:54:00Z INFO 49119 [Tensorizer]: Weights total number of bytes: 164096 +2025-08-07T13:54:00Z INFO 49119 [Tensorizer]: Successfully built model. +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.001 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.004 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (1, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (1, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.001 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.003 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (1, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (1, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-08-07T13:54:00Z INFO 49121 [sg0002/Tensorizer/OptimizeNKIKernels]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49121 [sg0002/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.537 seconds +2025-08-07T13:54:00Z INFO 49121 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:54:00Z INFO 49121 [sg0002/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49121 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.024 seconds +2025-08-07T13:54:00Z INFO 49121 [sg0002/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-08-07T13:54:00Z WARNING 49121 [sg0002/Tensorizer/StaticProfiler]: matmul-based transposes inserted by penguin takes up 50.73 percent of all matmul computation +2025-08-07T13:54:00Z INFO 49121 [sg0002/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [sg0002/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.012 seconds +2025-08-07T13:54:00Z INFO 49121 [sg0002/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-08-07T13:54:00Z INFO 49121 [sg0002/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49121 [sg0002/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.108 seconds +2025-08-07T13:54:00Z INFO 49121 [sg0002/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-08-07T13:54:00Z INFO 49121 [sg0002/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [sg0002/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.013 seconds +2025-08-07T13:54:00Z INFO 49121 [sg0002/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-08-07T13:54:00Z INFO 49121 [sg0002/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [sg0002/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.095 seconds +2025-08-07T13:54:00Z INFO 49121 [sg0002/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-08-07T13:54:00Z INFO 49121 [sg0002/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49121 [sg0002/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49121 [sg0002/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-08-07T13:54:01Z INFO 49121 [sg0002/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49121 [sg0002/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.453 seconds +2025-08-07T13:54:01Z INFO 49121 [Tensorizer]: BirCodeGen estimate #instances=100944 in sg0002 +2025-08-07T13:54:01Z INFO 49121 [Tensorizer]: IR signature: 68be87f5c8757415b3f3fa01ecb429f46a93147b8715d42639b03f21b24c9acb for nc00/sg0002/TensorizerBIR +2025-08-07T13:54:01Z INFO 49121 [Tensorizer]: Weights total number of bytes: 135176 +2025-08-07T13:54:01Z INFO 49121 [Tensorizer]: Successfully built model. +2025-08-07T13:54:01Z USER 47910 [root/Tensorizer/Tensorizer]: Tensorizer finished after 9.388 seconds +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: End tensorization +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input76 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input0 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input79 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input83 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input82 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input1 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input81 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input80 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input78 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input77 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input4 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input2 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input5 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input86 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input87 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input85 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input84 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input90 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input94 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input93 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input92 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input91 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input89 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input88 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input6 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input2 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input7 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input471 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input472 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input470 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input469 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input474 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input1 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input473 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Network input: input3 +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: wrote bir.json +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: wrote tensor_map.json +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: wrote bir.json +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: wrote tensor_map.json +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: wrote bir.json +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: wrote tensor_map.json +2025-08-07T13:54:01Z INFO 47910 [job.Frontend.0]: Job #0 finished +2025-08-07T13:54:01Z INFO 47910 [pipeline.Pipeline.0]: Finished job job.Frontend.0 +2025-08-07T13:54:01Z INFO 47910 [pipeline.Pipeline.0]: Starting job job.StaticIOTranspose.0 +2025-08-07T13:54:01Z INFO 47910 [pipeline.Pipeline.0]: Finished job job.StaticIOTranspose.0 +2025-08-07T13:54:01Z INFO 47910 [pipeline.Pipeline.0]: Starting job job.WalrusDriver.0 +2025-08-07T13:54:01Z INFO 47910 [job.WalrusDriver.0]: BackendDriver has 3 states with 1 core LNC +2025-08-07T13:54:01Z INFO 47910 [job.WalrusDriver.0]: BackendDriver MT cwd: /home/ubuntu/qwen3/context_encoding_model/_tp0_bk2/neuronxcc-_bze9vv6 +2025-08-07T13:54:01Z INFO 47910 [job.BIRLinker.1]: Creating directory sgLnk/sg00 +2025-08-07T13:54:01Z INFO 47910 [job.WalrusDriver.0]: StateId sg00 Dir /home/ubuntu/qwen3/context_encoding_model/_tp0_bk2/neuronxcc-_bze9vv6/sg00 +2025-08-07T13:54:01Z INFO 47910 [job.WalrusDriver.0]: StateId sg01 Dir /home/ubuntu/qwen3/context_encoding_model/_tp0_bk2/neuronxcc-_bze9vv6/sg01 +2025-08-07T13:54:01Z INFO 47910 [job.WalrusDriver.0]: StateId sg02 Dir /home/ubuntu/qwen3/context_encoding_model/_tp0_bk2/neuronxcc-_bze9vv6/sg02 +2025-08-07T13:54:01Z INFO 47910 [job.WalrusDriver.0]: Number of subgraphs to link: 3 +2025-08-07T13:54:01Z INFO 47910 [job.WalrusDriver.0]: lnkState: {"model": ["/home/ubuntu/qwen3/context_encoding_model/_tp0_bk2/model.MODULE_00594b8bc68e927f3dbe+1ad60ced.hlo_module.pb"], "tensormap": "tensor_map.json", "bir": "bir.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "state_dir": "/home/ubuntu/qwen3/context_encoding_model/_tp0_bk2/neuronxcc-_bze9vv6/sgLnk/sg00", "state_id": "sgLnk"} +2025-08-07T13:54:01Z INFO 47910 [job.WalrusDriver.0]: BackendDriver in_state.num_states 3 with 1 core LNC +2025-08-07T13:54:01Z INFO 47910 [job.WalrusDriver.0]: Executing /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/walrus_driver --optlevel 2 --allocator coloring --verbose 35 --logfile-verbose 20 --logfile /home/ubuntu/qwen3/context_encoding_model/_tp0_bk2/log-neuron-cc.txt -o walrus_bir.out.json --enable-call-graph --enable-mt-backend --link-subgraphs sg00,sg01,sg02 --link-dir sgLnk/sg00 --execute-repetition 1 -i bir.json --min_split_size 10240 --skip_split_vns '' --no_split_dram --split_huge_dram_tensor 1.0 --preprocessing_only --max_tensorizer_distance 64 --pack_same_shape_only --instruction_fetch_latency 511 --max-partitions 1 --policy 3 --auxflag 0 --interleave none --schedule-delayed-latency 1 --postsched-mm-accum-reorder=false --max-load-lower-bound 0.14 --force-prefetch-follow-incoming-order -1 --allreduce-buffer-size 500 --dram-page-size 512 --dram-rotation-size -1 --allreduce-rotation-dis 8 --repeat-load-thres 4 --enable-mm-transpose-remat-optimization=true --save-len-thres 512 --save-dma-cnt-thres 32 --relaxed-order=true --enable-anti-dependence-reduction=false --num-semaphores-per-queue 16 --numcores 1 --act-root-json /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/pwp/pwp_bin_trainium/act_info.json --dve-root-json /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen2/dve_info.json --enable-verifier=true --enable-birsim=false --enable-birsim-sync-only=false --enable-data-race-checker=false --enable-new-backend=true --inject-error=NONE --enable-internal-partitioner --dge-levels vector_dynamic_offsets,scalar_dynamic_offset,io --dynamic-dma-scratch-size-per-partition=16384 --neff-output-filename /home/ubuntu/qwen3/context_encoding_model/_tp0_bk2/model.MODULE_00594b8bc68e927f3dbe+1ad60ced.neff +2025-08-07T13:54:01Z INFO 47910 [job.WalrusDriver.0]: Working directory is /home/ubuntu/qwen3/context_encoding_model/_tp0_bk2/neuronxcc-_bze9vv6 +2025-08-07T13:54:01Z INFO 47910 [job.WalrusDriver.0]: propagate_exit=True +2025-08-07T13:54:01Z INFO 47910 [job.WalrusDriver.0]: use_logger=False +2025-08-07T13:54:01Z INFO 47910 [job.WalrusDriver.0]: expose_stderr=True +2025-08-07T13:54:01Z INFO 49164 [Logging]: Logging to ../log-neuron-cc.txt at level 'INFO' +2025-08-07T13:54:01Z INFO 49164 [BackendDriver]: max_allowed_parallelism=128 +2025-08-07T13:54:01Z INFO 49164 [BackendDriver]: Loading module from sg00/bir.json +2025-08-07T13:54:01Z INFO 49164 [BackendDriver]: Loading module from sg01/bir.json +2025-08-07T13:54:01Z INFO 49164 [BackendDriver]: Loading module from sg02/bir.json +2025-08-07T13:54:01Z INFO 49164 [BackendDriver]: Backend driver mtBackend: true numModules: 3 Cwd: "/home/ubuntu/qwen3/context_encoding_model/_tp0_bk2/neuronxcc-_bze9vv6" +2025-08-07T13:54:01Z INFO 49164 [BackendDriver]: DynamicDMA is enabled +2025-08-07T13:54:01Z INFO 49164 [BackendDriver]: DynamicDMA levels being enabled: io, scalar_dynamic_offset, vector_dynamic_offsets, +2025-08-07T13:54:01Z INFO 49164 [BackendDriver]: Modular flow call graph is enabled +2025-08-07T13:54:01Z INFO 49164 [BackendDriver]: Internal partitioner is enabled +2025-08-07T13:54:01Z USER 49164 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:54:01Z INFO 49164 [BackendPassManager]: Inputs to mod_parallel_pass: modules=3 functions=3 allocs=661 blocks=3 instructions=1093 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:01Z USER 49164 (sg00) [ModuleForkPass]: Running do_nothing +2025-08-07T13:54:01Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=188 blocks=1 instructions=71 Max writers: 5 Max Readers: 9 +2025-08-07T13:54:01Z USER 49164 (sg00) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-08-07T13:54:01Z USER 49164 (sg02) [ModuleForkPass]: Running do_nothing +2025-08-07T13:54:01Z USER 49164 (sg01) [ModuleForkPass]: Running do_nothing +2025-08-07T13:54:01Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 70mb, ru_maxrss: 201mb (delta=0mb) +2025-08-07T13:54:01Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 188 memory location(s), 1 block(s), and 71 instruction(s). Max writers: 5 Max Readers: 9 +2025-08-07T13:54:01Z USER 49164 (sg00) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:01Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=135 blocks=1 instructions=60 Max writers: 2 Max Readers: 8 +2025-08-07T13:54:01Z USER 49164 (sg01) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=188 blocks=1 instructions=71 Max writers: 5 Max Readers: 9 +2025-08-07T13:54:01Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 70mb, ru_maxrss: 201mb (delta=0mb) +2025-08-07T13:54:01Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 135 memory location(s), 1 block(s), and 60 instruction(s). Max writers: 2 Max Readers: 8 +2025-08-07T13:54:01Z USER 49164 (sg01) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:01Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=135 blocks=1 instructions=60 Max writers: 2 Max Readers: 8 +2025-08-07T13:54:01Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=338 blocks=1 instructions=962 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:01Z USER 49164 (sg02) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 70mb, ru_maxrss: 201mb (delta=0mb) +2025-08-07T13:54:01Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 338 memory location(s), 1 block(s), and 962 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:54:01Z USER 49164 (sg02) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:01Z WARNING 49164 [birverifier::InstVisitor]: (sg00) Non - output memory location with no reader: {convert.270.1786}@SB<0,0>(1x2)#Internal DebugInfo: +2025-08-07T13:54:01Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=338 blocks=1 instructions=962 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:01Z USER 49164 (sg00) [ModuleForkPass]: birverifier finished after 0.007 seconds +2025-08-07T13:54:01Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 86mb, ru_maxrss: 201mb (delta=0mb) +2025-08-07T13:54:01Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 188 memory location(s), 1 block(s), and 71 instruction(s). Max writers: 5 Max Readers: 9 +2025-08-07T13:54:01Z USER 49164 (sg01) [ModuleForkPass]: birverifier finished after 0.021 seconds +2025-08-07T13:54:01Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 124mb, ru_maxrss: 201mb (delta=0mb) +2025-08-07T13:54:01Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 135 memory location(s), 1 block(s), and 60 instruction(s). Max writers: 2 Max Readers: 8 +2025-08-07T13:54:01Z USER 49164 (sg02) [ModuleForkPass]: birverifier finished after 0.142 seconds +2025-08-07T13:54:01Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 273mb, ru_maxrss: 273mb (delta=72mb) +2025-08-07T13:54:01Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 338 memory location(s), 1 block(s), and 962 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:54:01Z USER 49164 [ModuleForkPass]: Compilation status: Total modules: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:01Z USER 49164 [BackendPassManager]: mod_parallel_pass finished after 0.145 seconds +2025-08-07T13:54:01Z INFO 49164 [BackendPassManager]: curr_vmrss: 265mb, ru_maxrss: 273mb (delta=72mb) +2025-08-07T13:54:01Z INFO 49164 [BackendPassManager]: Output has 3 module(s), 3 function(s), 661 memory location(s), 3 block(s), and 1093 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:54:01Z USER 49164 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:54:01Z INFO 49164 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=3 functions=3 allocs=661 blocks=3 instructions=1093 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:01Z USER 49164 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:54:01Z INFO 49164 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=188 blocks=1 instructions=71 Max writers: 5 Max Readers: 9 +2025-08-07T13:54:01Z USER 49164 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49164 (sg00) [SubgraphForkPass]: curr_vmrss: 265mb, ru_maxrss: 273mb (delta=0mb) +2025-08-07T13:54:01Z INFO 49164 (sg00) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 188 memory location(s), 1 block(s), and 71 instruction(s). Max writers: 5 Max Readers: 9 +2025-08-07T13:54:01Z USER 49164 (sg02) [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:54:01Z USER 49164 (sg01) [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:54:01Z INFO 49164 (sg01) [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=135 blocks=1 instructions=60 Max writers: 2 Max Readers: 8 +2025-08-07T13:54:01Z USER 49164 (sg01) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49164 (sg01) [SubgraphForkPass]: curr_vmrss: 265mb, ru_maxrss: 273mb (delta=0mb) +2025-08-07T13:54:01Z INFO 49164 (sg01) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 135 memory location(s), 1 block(s), and 60 instruction(s). Max writers: 2 Max Readers: 8 +2025-08-07T13:54:01Z INFO 49164 (sg02) [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=338 blocks=1 instructions=962 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:01Z USER 49164 (sg02) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49164 (sg02) [SubgraphForkPass]: curr_vmrss: 265mb, ru_maxrss: 273mb (delta=0mb) +2025-08-07T13:54:01Z INFO 49164 (sg02) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 338 memory location(s), 1 block(s), and 962 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:54:01Z USER 49164 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:01Z USER 49164 [BackendPassManager]: subgraph_parallel_pass finished after 0.001 seconds +2025-08-07T13:54:01Z INFO 49164 [BackendPassManager]: curr_vmrss: 265mb, ru_maxrss: 273mb (delta=0mb) +2025-08-07T13:54:01Z INFO 49164 [BackendPassManager]: Output has 3 module(s), 3 function(s), 661 memory location(s), 3 block(s), and 1093 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:54:01Z USER 49164 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:54:01Z INFO 49164 [BackendPassManager]: Inputs to mod_parallel_pass: modules=3 functions=3 allocs=661 blocks=3 instructions=1093 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:01Z USER 49164 (sg00) [ModuleForkPass]: Running expand_replication +2025-08-07T13:54:01Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=188 blocks=1 instructions=71 Max writers: 5 Max Readers: 9 +2025-08-07T13:54:01Z INFO 49164 (sg00) [ExpandReplication]: Found 0 replicated matmults +2025-08-07T13:54:01Z USER 49164 (sg00) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 265mb, ru_maxrss: 273mb (delta=0mb) +2025-08-07T13:54:01Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 188 memory location(s), 1 block(s), and 71 instruction(s). Max writers: 5 Max Readers: 9 +2025-08-07T13:54:01Z USER 49164 (sg00) [ModuleForkPass]: Running unroll +2025-08-07T13:54:01Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=188 blocks=1 instructions=71 Max writers: 5 Max Readers: 9 +2025-08-07T13:54:01Z INFO 49164 (sg00) [Unroll]: INFO (Unroll) Start unrolling at Thu Aug 7 13:54:01 2025 +2025-08-07T13:54:01Z USER 49164 (sg01) [ModuleForkPass]: Running expand_replication +2025-08-07T13:54:01Z USER 49164 (sg02) [ModuleForkPass]: Running expand_replication +2025-08-07T13:54:01Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=135 blocks=1 instructions=60 Max writers: 2 Max Readers: 8 +2025-08-07T13:54:01Z INFO 49164 (sg01) [ExpandReplication]: Found 0 replicated matmults +2025-08-07T13:54:01Z USER 49164 (sg01) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 265mb, ru_maxrss: 273mb (delta=0mb) +2025-08-07T13:54:01Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 135 memory location(s), 1 block(s), and 60 instruction(s). Max writers: 2 Max Readers: 8 +2025-08-07T13:54:01Z USER 49164 (sg01) [ModuleForkPass]: Running unroll +2025-08-07T13:54:01Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=135 blocks=1 instructions=60 Max writers: 2 Max Readers: 8 +2025-08-07T13:54:01Z INFO 49164 (sg01) [Unroll]: INFO (Unroll) Start unrolling at Thu Aug 7 13:54:01 2025 +2025-08-07T13:54:01Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=338 blocks=1 instructions=962 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:01Z INFO 49164 (sg02) [ExpandReplication]: Found 0 replicated matmults +2025-08-07T13:54:01Z USER 49164 (sg02) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 266mb, ru_maxrss: 273mb (delta=0mb) +2025-08-07T13:54:01Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 338 memory location(s), 1 block(s), and 962 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:54:01Z USER 49164 (sg02) [ModuleForkPass]: Running unroll +2025-08-07T13:54:01Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=338 blocks=1 instructions=962 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:01Z INFO 49164 (sg02) [Unroll]: INFO (Unroll) Start unrolling at Thu Aug 7 13:54:01 2025 +2025-08-07T13:54:01Z INFO 49164 (sg00) [Unroll]: INFO (Unroll) DONE unrolling Thu Aug 7 13:54:01 2025 + +2025-08-07T13:54:01Z INFO 49164 (sg00) [Unroll]: sg0000 Instruction count after Unroll: +2025-08-07T13:54:01Z INFO 49164 (sg00) [Unroll]: Total count: 3755 +2025-08-07T13:54:01Z INFO 49164 (sg00) [Unroll]: Matmult: 1976 +2025-08-07T13:54:01Z INFO 49164 (sg00) [Unroll]: TensorScalarPtr: 425 +2025-08-07T13:54:01Z INFO 49164 (sg00) [Unroll]: GenericCopy: 325 +2025-08-07T13:54:01Z INFO 49164 (sg00) [Unroll]: TensorTensor: 322 +2025-08-07T13:54:01Z INFO 49164 (sg00) [Unroll]: Load: 183 +2025-08-07T13:54:01Z INFO 49164 (sg00) [Unroll]: Activation: 179 +2025-08-07T13:54:01Z INFO 49164 (sg00) [Unroll]: TensorReduce: 128 +2025-08-07T13:54:01Z INFO 49164 (sg00) [Unroll]: TensorScalarAffineSelect: 64 +2025-08-07T13:54:01Z INFO 49164 (sg00) [Unroll]: Memset: 57 +2025-08-07T13:54:01Z INFO 49164 (sg00) [Unroll]: Save: 44 +2025-08-07T13:54:01Z INFO 49164 (sg00) [Unroll]: DMACopy: 38 +2025-08-07T13:54:01Z INFO 49164 (sg00) [Unroll]: Reciprocal: 8 +2025-08-07T13:54:01Z INFO 49164 (sg00) [Unroll]: Iota: 4 +2025-08-07T13:54:01Z INFO 49164 (sg00) [Unroll]: CollectiveCompute: 2 +2025-08-07T13:54:01Z INFO 49164 (sg00) [Unroll]: Unrolled DGE count with Dynamic AP: 36 +2025-08-07T13:54:01Z USER 49164 (sg00) [ModuleForkPass]: unroll finished after 0.041 seconds +2025-08-07T13:54:01Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 336mb, ru_maxrss: 336mb (delta=63mb) +2025-08-07T13:54:01Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1898 memory location(s), 1 block(s), and 3755 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:01Z INFO 49164 (sg01) [Unroll]: INFO (Unroll) DONE unrolling Thu Aug 7 13:54:01 2025 + +2025-08-07T13:54:01Z INFO 49164 (sg01) [Unroll]: sg0001 Instruction count after Unroll: +2025-08-07T13:54:01Z INFO 49164 (sg01) [Unroll]: Total count: 9982 +2025-08-07T13:54:01Z INFO 49164 (sg01) [Unroll]: Matmult: 6780 +2025-08-07T13:54:01Z INFO 49164 (sg01) [Unroll]: Load: 1867 +2025-08-07T13:54:01Z INFO 49164 (sg01) [Unroll]: GenericCopy: 342 +2025-08-07T13:54:01Z INFO 49164 (sg01) [Unroll]: TensorScalarPtr: 264 +2025-08-07T13:54:01Z INFO 49164 (sg01) [Unroll]: TensorTensor: 224 +2025-08-07T13:54:01Z INFO 49164 (sg01) [Unroll]: Activation: 218 +2025-08-07T13:54:01Z INFO 49164 (sg01) [Unroll]: TensorReduce: 128 +2025-08-07T13:54:01Z INFO 49164 (sg01) [Unroll]: Select: 64 +2025-08-07T13:54:01Z INFO 49164 (sg01) [Unroll]: Save: 41 +2025-08-07T13:54:01Z INFO 49164 (sg01) [Unroll]: DMACopy: 34 +2025-08-07T13:54:01Z INFO 49164 (sg01) [Unroll]: Memset: 10 +2025-08-07T13:54:01Z INFO 49164 (sg01) [Unroll]: Reciprocal: 8 +2025-08-07T13:54:01Z INFO 49164 (sg01) [Unroll]: CollectiveCompute: 2 +2025-08-07T13:54:01Z INFO 49164 (sg01) [Unroll]: Unrolled DGE count with Dynamic AP: 32 +2025-08-07T13:54:01Z USER 49164 (sg01) [ModuleForkPass]: unroll finished after 0.111 seconds +2025-08-07T13:54:01Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 394mb, ru_maxrss: 394mb (delta=121mb) +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2265 memory location(s), 1 block(s), and 9982 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z INFO 49164 (sg02) [Unroll]: INFO (Unroll) DONE unrolling Thu Aug 7 13:54:01 2025 + +2025-08-07T13:54:02Z INFO 49164 (sg02) [Unroll]: sg0002 Instruction count after Unroll: +2025-08-07T13:54:02Z INFO 49164 (sg02) [Unroll]: Total count: 54605 +2025-08-07T13:54:02Z INFO 49164 (sg02) [Unroll]: Matmult: 43667 +2025-08-07T13:54:02Z INFO 49164 (sg02) [Unroll]: GenericCopy: 6122 +2025-08-07T13:54:02Z INFO 49164 (sg02) [Unroll]: Load: 2962 +2025-08-07T13:54:02Z INFO 49164 (sg02) [Unroll]: Save: 633 +2025-08-07T13:54:02Z INFO 49164 (sg02) [Unroll]: Max: 224 +2025-08-07T13:54:02Z INFO 49164 (sg02) [Unroll]: MaxIndex: 224 +2025-08-07T13:54:02Z INFO 49164 (sg02) [Unroll]: MatchReplace: 217 +2025-08-07T13:54:02Z INFO 49164 (sg02) [Unroll]: TensorScalarPtr: 214 +2025-08-07T13:54:02Z INFO 49164 (sg02) [Unroll]: TensorTensor: 153 +2025-08-07T13:54:02Z INFO 49164 (sg02) [Unroll]: Activation: 117 +2025-08-07T13:54:02Z INFO 49164 (sg02) [Unroll]: Gather: 35 +2025-08-07T13:54:02Z INFO 49164 (sg02) [Unroll]: Memset: 12 +2025-08-07T13:54:02Z INFO 49164 (sg02) [Unroll]: TensorReduce: 8 +2025-08-07T13:54:02Z INFO 49164 (sg02) [Unroll]: StreamShuffle: 4 +2025-08-07T13:54:02Z INFO 49164 (sg02) [Unroll]: CollectiveCompute: 3 +2025-08-07T13:54:02Z INFO 49164 (sg02) [Unroll]: Select: 3 +2025-08-07T13:54:02Z INFO 49164 (sg02) [Unroll]: Reciprocal: 3 +2025-08-07T13:54:02Z INFO 49164 (sg02) [Unroll]: Iota: 2 +2025-08-07T13:54:02Z INFO 49164 (sg02) [Unroll]: DMACopy: 2 +2025-08-07T13:54:02Z INFO 49164 (sg02) [Unroll]: Unrolled DGE count with Dynamic AP: 1 +2025-08-07T13:54:02Z USER 49164 (sg02) [ModuleForkPass]: unroll finished after 0.540 seconds +2025-08-07T13:54:02Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 599mb, ru_maxrss: 599mb (delta=326mb) +2025-08-07T13:54:02Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11071 memory location(s), 1 block(s), and 54605 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z USER 49164 [ModuleForkPass]: Compilation status: Total modules: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:02Z USER 49164 [BackendPassManager]: mod_parallel_pass finished after 0.553 seconds +2025-08-07T13:54:02Z INFO 49164 [BackendPassManager]: curr_vmrss: 433mb, ru_maxrss: 599mb (delta=326mb) +2025-08-07T13:54:02Z INFO 49164 [BackendPassManager]: Output has 3 module(s), 3 function(s), 15234 memory location(s), 3 block(s), and 68342 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z USER 49164 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:54:02Z INFO 49164 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=3 functions=3 allocs=15234 blocks=3 instructions=68342 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z USER 49164 (sg00) [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:54:02Z INFO 49164 (sg00) [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=1898 blocks=1 instructions=3755 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg01) [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:54:02Z USER 49164 (sg02) [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:54:02Z INFO 49164 (sg01) [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=2265 blocks=1 instructions=9982 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z INFO 49164 (sg02) [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=11071 blocks=1 instructions=54605 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49164 (sg00) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49164 (sg00) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49164 (sg00) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z USER 49164 (sg00) [SubgraphForkPass]: dead_code_elim finished after 0.007 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [SubgraphForkPass]: curr_vmrss: 434mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 1785 memory location(s), 1 block(s), and 3754 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 (sg01) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49164 (sg01) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49164 (sg01) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49164 (sg01) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z USER 49164 (sg01) [SubgraphForkPass]: dead_code_elim finished after 0.040 seconds +2025-08-07T13:54:02Z INFO 49164 (sg01) [SubgraphForkPass]: curr_vmrss: 453mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg01) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 2193 memory location(s), 1 block(s), and 9982 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z INFO 49164 (sg02) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49164 (sg02) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49164 (sg02) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49164 (sg02) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z USER 49164 (sg02) [SubgraphForkPass]: dead_code_elim finished after 0.102 seconds +2025-08-07T13:54:02Z INFO 49164 (sg02) [SubgraphForkPass]: curr_vmrss: 459mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg02) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 11038 memory location(s), 1 block(s), and 54605 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z USER 49164 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:02Z USER 49164 [BackendPassManager]: subgraph_parallel_pass finished after 0.104 seconds +2025-08-07T13:54:02Z INFO 49164 [BackendPassManager]: curr_vmrss: 459mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 [BackendPassManager]: Output has 3 module(s), 3 function(s), 15016 memory location(s), 3 block(s), and 68341 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z USER 49164 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:54:02Z INFO 49164 [BackendPassManager]: Inputs to mod_parallel_pass: modules=3 functions=3 allocs=15016 blocks=3 instructions=68341 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=1785 blocks=1 instructions=3754 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:02Z USER 49164 (sg02) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=2193 blocks=1 instructions=9982 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=11038 blocks=1 instructions=54605 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: birverifier finished after 0.005 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1785 memory location(s), 1 block(s), and 3754 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: birverifier finished after 0.009 seconds +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2193 memory location(s), 1 block(s), and 9982 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z USER 49164 (sg02) [ModuleForkPass]: birverifier finished after 0.055 seconds +2025-08-07T13:54:02Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11038 memory location(s), 1 block(s), and 54605 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z USER 49164 [ModuleForkPass]: Compilation status: Total modules: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:02Z USER 49164 [BackendPassManager]: mod_parallel_pass finished after 0.057 seconds +2025-08-07T13:54:02Z INFO 49164 [BackendPassManager]: curr_vmrss: 459mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 [BackendPassManager]: Output has 3 module(s), 3 function(s), 15016 memory location(s), 3 block(s), and 68341 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z USER 49164 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:54:02Z INFO 49164 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=3 functions=3 allocs=15016 blocks=3 instructions=68341 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z USER 49164 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:54:02Z INFO 49164 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=1785 blocks=1 instructions=3754 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-08-07T13:54:02Z USER 49164 (sg01) [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:54:02Z INFO 49164 (sg00) [SubgraphForkPass]: curr_vmrss: 459mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z USER 49164 (sg02) [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:54:02Z INFO 49164 (sg00) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 1785 memory location(s), 1 block(s), and 3754 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=2193 blocks=1 instructions=9982 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z USER 49164 (sg01) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49164 (sg01) [SubgraphForkPass]: curr_vmrss: 459mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg01) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 2193 memory location(s), 1 block(s), and 9982 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z INFO 49164 (sg02) [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=11038 blocks=1 instructions=54605 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z USER 49164 (sg02) [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49164 (sg02) [SubgraphForkPass]: curr_vmrss: 459mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg02) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 11038 memory location(s), 1 block(s), and 54605 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z USER 49164 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:02Z USER 49164 [BackendPassManager]: subgraph_parallel_pass finished after 0.003 seconds +2025-08-07T13:54:02Z INFO 49164 [BackendPassManager]: curr_vmrss: 459mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 [BackendPassManager]: Output has 3 module(s), 3 function(s), 15016 memory location(s), 3 block(s), and 68341 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z USER 49164 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:54:02Z INFO 49164 [BackendPassManager]: Inputs to mod_parallel_pass: modules=3 functions=3 allocs=15016 blocks=3 instructions=68341 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running instruction_reorder +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=1785 blocks=1 instructions=3754 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: Running instruction_reorder +2025-08-07T13:54:02Z USER 49164 (sg02) [ModuleForkPass]: Running instruction_reorder +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=2193 blocks=1 instructions=9982 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: instruction_reorder finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=11038 blocks=1 instructions=54605 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1785 memory location(s), 1 block(s), and 3754 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running psum_legalization +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=1785 blocks=1 instructions=3754 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: psum_legalization finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1785 memory location(s), 1 block(s), and 3754 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running legalize_cce_dma +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=1785 blocks=1 instructions=3754 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: legalize_cce_dma finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1785 memory location(s), 1 block(s), and 3754 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running pre_opts +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=1785 blocks=1 instructions=3754 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 (sg00) [PreOpts]: Skipped. No pre-opt passes enabled +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1785 memory location(s), 1 block(s), and 3754 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running error_injector +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=1785 blocks=1 instructions=3754 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z WARNING 49164 (sg00) [ErrorInjector]: Unrecognized injected error value "0" +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1785 memory location(s), 1 block(s), and 3754 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running vn_splitter +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=1785 blocks=1 instructions=3754 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 (sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 0 +2025-08-07T13:54:02Z INFO 49164 (sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: instruction_reorder finished after 0.003 seconds +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2193 memory location(s), 1 block(s), and 9982 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: Running psum_legalization +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=2193 blocks=1 instructions=9982 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: psum_legalization finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2193 memory location(s), 1 block(s), and 9982 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: Running legalize_cce_dma +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=2193 blocks=1 instructions=9982 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z INFO 49164 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-08-07T13:54:02Z INFO 49164 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-08-07T13:54:02Z INFO 49164 (sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.001 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0 seconds +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: vn_splitter finished after 0.003 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1785 memory location(s), 1 block(s), and 3754 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running constant_propagate +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=1785 blocks=1 instructions=3754 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: legalize_cce_dma finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2193 memory location(s), 1 block(s), and 9982 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: Running pre_opts +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=2193 blocks=1 instructions=9982 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z INFO 49164 (sg01) [PreOpts]: Skipped. No pre-opt passes enabled +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2193 memory location(s), 1 block(s), and 9982 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: Running error_injector +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=2193 blocks=1 instructions=9982 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z WARNING 49164 (sg01) [ErrorInjector]: Unrecognized injected error value "0" +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2193 memory location(s), 1 block(s), and 9982 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: Running vn_splitter +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=2193 blocks=1 instructions=9982 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z INFO 49164 (sg01) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 1 +2025-08-07T13:54:02Z INFO 49164 (sg01) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-08-07T13:54:02Z INFO 49164 (sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49164 (sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49164 (sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49164 (sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49164 (sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-08-07T13:54:02Z INFO 49164 (sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49164 (sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49164 (sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49164 (sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: constant_propagate finished after 0.007 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1785 memory location(s), 1 block(s), and 3754 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running lower_ac +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=1785 blocks=1 instructions=3754 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 (sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: lower_ac finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 459mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1785 memory location(s), 1 block(s), and 3754 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running input_dma_coalescing +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=1785 blocks=1 instructions=3754 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-08-07T13:54:02Z INFO 49164 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-08-07T13:54:02Z INFO 49164 (sg01) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-08-07T13:54:02Z INFO 49164 (sg01) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.002 seconds +2025-08-07T13:54:02Z INFO 49164 (sg01) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.002 seconds +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: vn_splitter finished after 0.007 seconds +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 460mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: input_dma_coalescing finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 460mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2193 memory location(s), 1 block(s), and 9982 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: Running constant_propagate +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1785 memory location(s), 1 block(s), and 3754 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running remat_optimization +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=2193 blocks=1 instructions=9982 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=1785 blocks=1 instructions=3754 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg02) [ModuleForkPass]: instruction_reorder finished after 0.014 seconds +2025-08-07T13:54:02Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 460mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg01) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-08-07T13:54:02Z INFO 49164 (sg00) [RematOpt]: Removed 0 remat instructions +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: remat_optimization finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 460mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11038 memory location(s), 1 block(s), and 54605 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1785 memory location(s), 1 block(s), and 3754 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg02) [ModuleForkPass]: Running psum_legalization +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running early_peephole_opts +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to early_peephole_opts: modules=1 functions=1 allocs=1785 blocks=1 instructions=3754 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 (sg00) [EarlyPeepholeOpts]: PeepholeOpts enabled? ActivationAccumulate: true +2025-08-07T13:54:02Z INFO 49164 (sg00) [EarlyPeepholeOpts]: Activation Accumulate: 0 +2025-08-07T13:54:02Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=11038 blocks=1 instructions=54605 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: early_peephole_opts finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 460mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1785 memory location(s), 1 block(s), and 3754 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=1785 blocks=1 instructions=3754 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 460mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1785 memory location(s), 1 block(s), and 3754 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running infer_stream_ids +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=1785 blocks=1 instructions=3754 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: infer_stream_ids finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 460mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1785 memory location(s), 1 block(s), and 3754 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running pre_sched +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=1785 blocks=1 instructions=3754 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 (sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 [LayerSpiller]: LayerSpill: Start... +2025-08-07T13:54:02Z INFO 49164 [LayerSpiller]: LayerSpill: Found 1 Splits CCs +2025-08-07T13:54:02Z INFO 49164 [LayerSpiller]: Grouped CCs to 1 clusters. +2025-08-07T13:54:02Z INFO 49164 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-08-07T13:54:02Z INFO 49164 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-08-07T13:54:02Z INFO 49164 [LayerSpiller]: LayerSpill: Done. +2025-08-07T13:54:02Z INFO 49164 (sg00) [PreSched]: Start split live ranges Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 (sg00) [PreSched]: No split opportunities: +2025-08-07T13:54:02Z INFO 49164 (sg00) [PreSched]: End split live ranges Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 (sg00) [PreSched]: Strt remove redundncies Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 (sg00) [PreSched]: remove_redundant_memsets +2025-08-07T13:54:02Z INFO 49164 (sg00) [PreSched]: remove_redundant_memsets: 0 +2025-08-07T13:54:02Z INFO 49164 (sg00) [PreSched]: remove_redundant_loads +2025-08-07T13:54:02Z INFO 49164 (sg00) [PreSched]: remove_redundant_loads: 0 +2025-08-07T13:54:02Z INFO 49164 (sg00) [PreSched]: End remove redundncies Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 (sg00) [PreSched]: Start DCE Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z USER 49164 (sg02) [ModuleForkPass]: psum_legalization finished after 0.007 seconds +2025-08-07T13:54:02Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 461mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11038 memory location(s), 1 block(s), and 54605 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z USER 49164 (sg02) [ModuleForkPass]: Running legalize_cce_dma +2025-08-07T13:54:02Z INFO 49164 (sg00) [PreSched]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=11038 blocks=1 instructions=54605 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z INFO 49164 (sg00) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49164 (sg00) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49164 (sg00) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49164 (sg00) [PreSched]: End DCE Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 (sg00) [PreSched]: Start build flow dependencies Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 (sg00) [build_flow_deps]: Start build fdeps. Invocation: 1Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 (sg00) [build_flow_deps]: Allocs: 1785 instructions: 3754 +2025-08-07T13:54:02Z USER 49164 (sg02) [ModuleForkPass]: legalize_cce_dma finished after 0.009 seconds +2025-08-07T13:54:02Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 461mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11038 memory location(s), 1 block(s), and 54605 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z USER 49164 (sg02) [ModuleForkPass]: Running pre_opts +2025-08-07T13:54:02Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=11038 blocks=1 instructions=54605 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z INFO 49164 (sg02) [PreOpts]: Skipped. No pre-opt passes enabled +2025-08-07T13:54:02Z USER 49164 (sg02) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 461mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [build_flow_deps]: Build fdeps inserted 9414 edges +2025-08-07T13:54:02Z INFO 49164 (sg00) [build_flow_deps]: Done build fdeps 9414 Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 (sg00) [PreSched]: End build flow dependencies Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 (sg00) [PreSched]: Start remove useless insts Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 (sg00) [PreSched]: remove_useless_insts +2025-08-07T13:54:02Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11038 memory location(s), 1 block(s), and 54605 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z USER 49164 (sg02) [ModuleForkPass]: Running error_injector +2025-08-07T13:54:02Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=11038 blocks=1 instructions=54605 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z WARNING 49164 (sg02) [ErrorInjector]: Unrecognized injected error value "0" +2025-08-07T13:54:02Z USER 49164 (sg02) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 461mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [PreSched]: remove Useless Instructions: 0 +2025-08-07T13:54:02Z INFO 49164 (sg00) [PreSched]: End remove useless insts Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 (sg00) [PreSched]: Start scratchpad optimization Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 (sg00) [PreSched]: End scratchpad optimization Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11038 memory location(s), 1 block(s), and 54605 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z USER 49164 (sg02) [ModuleForkPass]: Running vn_splitter +2025-08-07T13:54:02Z INFO 49164 (sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=11038 blocks=1 instructions=54605 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z INFO 49164 (sg02) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 14 +2025-08-07T13:54:02Z INFO 49164 (sg02) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-08-07T13:54:02Z INFO 49164 (sg00) [PreSched]: DONE PRE scheduling Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: pre_sched finished after 0.021 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 461mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1785 memory location(s), 1 block(s), and 3754 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=1785 blocks=1 instructions=3754 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 (sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-08-07T13:54:02Z INFO 49164 (sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49164 (sg00) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49164 (sg00) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49164 (sg00) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.004 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 462mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1785 memory location(s), 1 block(s), and 3754 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running dynamic_dma_setup +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=1785 blocks=1 instructions=3754 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 462mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1786 memory location(s), 1 block(s), and 3754 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running runtime_memory_reservation +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=1786 blocks=1 instructions=3754 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 462mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1786 memory location(s), 1 block(s), and 3754 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running coloring_allocator_psum +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=1786 blocks=1 instructions=3754 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 (sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49164 (sg00) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:02Z INFO 49164 (sg00) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:02Z INFO 49164 (sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49164 (sg01) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-08-07T13:54:02Z INFO 49164 (sg00) [PSUM_Allocator]: allocating PSUM +2025-08-07T13:54:02Z INFO 49164 (sg00) [PSUM_Allocator]: main loop +2025-08-07T13:54:02Z INFO 49164 (sg00) [PSUM_Allocator]: renumber locations +2025-08-07T13:54:02Z INFO 49164 (sg00) [PSUM_Allocator]: size = 481 +2025-08-07T13:54:02Z INFO 49164 (sg00) [PSUM_Allocator]: build_no_bitmap start +2025-08-07T13:54:02Z INFO 49164 (sg00) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-08-07T13:54:02Z INFO 49164 (sg00) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-08-07T13:54:02Z INFO 49164 (sg00) [PSUM_Allocator]: found 1039 edges +2025-08-07T13:54:02Z INFO 49164 (sg00) [PSUM_Allocator]: mean: 4.32017 +2025-08-07T13:54:02Z INFO 49164 (sg00) [PSUM_Allocator]: median: 3.49593 +2025-08-07T13:54:02Z INFO 49164 (sg00) [PSUM_Allocator]: adjacency vectors require 8312 bytes +2025-08-07T13:54:02Z INFO 49164 (sg00) [PSUM_Allocator]: build_no_bitmap done +2025-08-07T13:54:02Z INFO 49164 (sg00) [PSUM_Allocator]: find costs +2025-08-07T13:54:02Z INFO 49164 (sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-08-07T13:54:02Z INFO 49164 (sg00) [PSUM_Allocator]: simplify interference graph +2025-08-07T13:54:02Z INFO 49164 (sg00) [PSUM_Allocator]: initialize low and high +2025-08-07T13:54:02Z INFO 49164 (sg00) [PSUM_Allocator]: lo = 481 +2025-08-07T13:54:02Z INFO 49164 (sg00) [PSUM_Allocator]: hi = 0 +2025-08-07T13:54:02Z INFO 49164 (sg00) [PSUM_Allocator]: inf = 0 +2025-08-07T13:54:02Z INFO 49164 (sg00) [PSUM_Allocator]: total = 481 +2025-08-07T13:54:02Z INFO 49164 (sg00) [PSUM_Allocator]: simplify +2025-08-07T13:54:02Z INFO 49164 (sg00) [PSUM_Allocator]: new candidates = 0 +2025-08-07T13:54:02Z INFO 49164 (sg00) [PSUM_Allocator]: select ranges +2025-08-07T13:54:02Z INFO 49164 (sg00) [PSUM_Allocator]: no more spills +2025-08-07T13:54:02Z INFO 49164 (sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-08-07T13:54:02Z INFO 49164 (sg00) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-08-07T13:54:02Z INFO 49164 (sg00) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: coloring_allocator_psum finished after 0.007 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 462mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1786 memory location(s), 1 block(s), and 3754 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running dma_optimization_psum +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=1786 blocks=1 instructions=3754 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: dma_optimization_psum finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 462mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1786 memory location(s), 1 block(s), and 3754 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running address_rotation_psum +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=1786 blocks=1 instructions=3754 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 38 PSUM Banks +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 20 PSUM Banks +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 112 PSUM Banks +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: address_rotation_psum finished after 0.006 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 462mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1786 memory location(s), 1 block(s), and 3754 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running coloring_allocator_sb +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=1786 blocks=1 instructions=3754 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 (sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 50391808 +2025-08-07T13:54:02Z INFO 49164 (sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 2174 bytes +2025-08-07T13:54:02Z INFO 49164 (sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 11010050 +2025-08-07T13:54:02Z INFO 49164 (sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 2000 bytes +2025-08-07T13:54:02Z INFO 49164 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 3166208 +2025-08-07T13:54:02Z INFO 49164 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 338 bytes +2025-08-07T13:54:02Z INFO 49164 (sg00) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:02Z INFO 49164 (sg00) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: allocating SB +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: main loop +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: renumber locations +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: size = 1276 +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: find partners +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: found 310 accumulation groups +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: largest = custom-call.226.1531 +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: tensors = 33 +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: requires 66048 bytes/partition +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: expanding partners +2025-08-07T13:54:02Z INFO 49164 (sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49164 []: find first defs for local +2025-08-07T13:54:02Z INFO 49164 []: find first defs for global +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: find loads +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: 1 pin count +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: 109 remat count +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: build interference graph +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: pass 1 int-tree +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: Num intervals 1276 Num locations 1276 +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: IntervalTree Build Done +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: info.neighbors init Done +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: info.neighbors partners Done +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: IntervalTree readback Done +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: edge: 74840 +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: mean: 117.304 +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: median: 87.5693 +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: find costs +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: simplify interference graph +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: initialize safe & unsafe +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: safe = 501 +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: unsafe = 660 +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: inf = 114 +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: total = 1275 +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: simplify +2025-08-07T13:54:02Z INFO 49164 (sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 586 #Pinned 0 #Safe 0 minCost 0.00105388 maxCost 0.0482429 locations 1276 +2025-08-07T13:54:02Z INFO 49164 (sg02) [ShrinkDN]: INFO (ShrinkDN): Shrunk 2 nodes. Total savings 14336 bytes/partition +2025-08-07T13:54:02Z INFO 49164 (sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: new candidates = 84 +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: select ranges +2025-08-07T13:54:02Z INFO 49164 (sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: constant_propagate finished after 0.055 seconds +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 464mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2193 memory location(s), 1 block(s), and 9982 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: Running lower_ac +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=2193 blocks=1 instructions=9982 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z INFO 49164 (sg01) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: lower_ac finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 464mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2193 memory location(s), 1 block(s), and 9982 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: Running input_dma_coalescing +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=2193 blocks=1 instructions=9982 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: Total: 1275 +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: Spilled: 0.000 (0) +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: Allocated: 1.000 (1275) +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: Rover zone: 0.336 (428) +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: Pre-rover zone: 0.009 (11) +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: Post-rover zone: 0.656 (836) +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: Slice zone: 0.000 (0) +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: Blocks tall: 1.000 (1275) +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.998 +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: Success +2025-08-07T13:54:02Z INFO 49164 (sg01) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: input_dma_coalescing finished after 0.003 seconds +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 464mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2193 memory location(s), 1 block(s), and 9982 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: Running remat_optimization +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=2193 blocks=1 instructions=9982 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: SB spills = 0 tensors +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: remats = 0 tensors +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: unpinned = 0 tensors +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: SB score = 0 +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: 16384 bytes/partition (100%) successfully pinned +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: pinning saved approximately 9010 cycles +2025-08-07T13:54:02Z INFO 49164 (sg00) [SB_Allocator]: 0% SB utilization after allocation +2025-08-07T13:54:02Z INFO 49164 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 50391808 +2025-08-07T13:54:02Z INFO 49164 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 2174 bytes +2025-08-07T13:54:02Z INFO 49164 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 11010050 +2025-08-07T13:54:02Z INFO 49164 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2000 bytes +2025-08-07T13:54:02Z INFO 49164 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 3166208 +2025-08-07T13:54:02Z INFO 49164 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 338 bytes +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: coloring_allocator_sb finished after 0.018 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 464mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1786 memory location(s), 1 block(s), and 3754 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=1786 blocks=1 instructions=3754 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 (sg01) [RematOpt]: Removed 0 remat instructions +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: remat_optimization finished after 0.003 seconds +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 464mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2193 memory location(s), 1 block(s), and 9982 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: Running early_peephole_opts +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to early_peephole_opts: modules=1 functions=1 allocs=2193 blocks=1 instructions=9982 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z INFO 49164 (sg01) [EarlyPeepholeOpts]: PeepholeOpts enabled? ActivationAccumulate: true +2025-08-07T13:54:02Z INFO 49164 (sg01) [EarlyPeepholeOpts]: Activation Accumulate: 0 +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: early_peephole_opts finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 464mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2193 memory location(s), 1 block(s), and 9982 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=2193 blocks=1 instructions=9982 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: address_rotation_sb finished after 0.002 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 464mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1786 memory location(s), 1 block(s), and 3754 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running dma_optimization_sb +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=1786 blocks=1 instructions=3754 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 464mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2193 memory location(s), 1 block(s), and 9982 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: Running infer_stream_ids +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=2193 blocks=1 instructions=9982 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 61401858, 68.4071% input load, 7.68477% output write, 23.9082% spill/reload [sg0000] +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: infer_stream_ids finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 464mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2193 memory location(s), 1 block(s), and 9982 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: Running pre_sched +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=2193 blocks=1 instructions=9982 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z INFO 49164 (sg01) [PreSched]: Start PRE scheduling 2 cores: 1 at: Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 [LayerSpiller]: LayerSpill: Start... +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:54:02Z INFO 49164 [LayerSpiller]: LayerSpill: Found 2 Splits CCs +2025-08-07T13:54:02Z INFO 49164 [LayerSpiller]: Grouped CCs to 2 clusters. +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: removed 0 identical load +2025-08-07T13:54:02Z INFO 49164 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-08-07T13:54:02Z INFO 49164 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-08-07T13:54:02Z INFO 49164 [LayerSpiller]: LayerSpill: Done. +2025-08-07T13:54:02Z INFO 49164 (sg01) [PreSched]: Start split live ranges Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:54:02Z INFO 49164 (sg01) [PreSched]: No split opportunities: +2025-08-07T13:54:02Z INFO 49164 (sg01) [PreSched]: End split live ranges Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 (sg01) [PreSched]: Strt remove redundncies Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 (sg01) [PreSched]: remove_redundant_memsets +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-08-07T13:54:02Z INFO 49164 (sg01) [PreSched]: remove_redundant_memsets: 0 +2025-08-07T13:54:02Z INFO 49164 (sg01) [PreSched]: remove_redundant_loads +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 2097152, 3.41545% out of total dma traffic(4.20032e+07) +2025-08-07T13:54:02Z INFO 49164 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-08-07T13:54:02Z INFO 49164 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:02Z INFO 49164 (sg01) [PreSched]: remove_redundant_loads: 0 +2025-08-07T13:54:02Z INFO 49164 (sg01) [PreSched]: End remove redundncies Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 (sg01) [PreSched]: Start DCE Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 (sg02) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-08-07T13:54:02Z INFO 49164 (sg02) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.016 seconds +2025-08-07T13:54:02Z INFO 49164 (sg02) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.016 seconds +2025-08-07T13:54:02Z USER 49164 (sg02) [ModuleForkPass]: vn_splitter finished after 0.050 seconds +2025-08-07T13:54:02Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 464mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:02Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11038 memory location(s), 1 block(s), and 54605 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z USER 49164 (sg02) [ModuleForkPass]: Running constant_propagate +2025-08-07T13:54:02Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=11038 blocks=1 instructions=54605 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: average loaded DMA size 2286 bytes +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: average saved DMA size 2000 bytes +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 48294656 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 2286 bytes +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 11010050 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 2000 bytes +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 2097152, 3.41545% out of total dma traffic +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 59304706, 70.8261% input load, 7.95653% output write, 21.2174% spill/reload [sg0000] +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 48294656 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 2286 bytes +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 11010050 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 2000 bytes +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 3166208 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 338 bytes +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 1735 bytes +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: dma_optimization_sb finished after 0.016 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 465mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1769 memory location(s), 1 block(s), and 3738 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=1769 blocks=1 instructions=3738 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: SB Rotation rotated 49 Sb address +2025-08-07T13:54:02Z INFO 49164 (sg02) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: SB Rotation rotated 40 Sb address +2025-08-07T13:54:02Z INFO 49164 (sg01) [PreSched]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:02Z INFO 49164 (sg01) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49164 (sg01) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49164 (sg01) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49164 (sg01) [PreSched]: End DCE Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: SB Rotation rotated 259 Sb address +2025-08-07T13:54:02Z INFO 49164 (sg01) [PreSched]: Start build flow dependencies Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 (sg01) [build_flow_deps]: Start build fdeps. Invocation: 2Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 (sg01) [build_flow_deps]: Allocs: 2193 instructions: 9982 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: address_rotation_sb finished after 0.022 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 465mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1769 memory location(s), 1 block(s), and 3738 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running coloring_allocator_dram +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=1769 blocks=1 instructions=3738 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 (sg00) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:02Z INFO 49164 (sg00) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:02Z INFO 49164 (sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-08-07T13:54:02Z INFO 49164 (sg00) [DRAM_Allocator]: reserved space = 677442310 bytes +2025-08-07T13:54:02Z INFO 49164 (sg00) [DRAM_Allocator]: spill space = 14680064 bytes +2025-08-07T13:54:02Z INFO 49164 (sg00) [DRAM_Allocator]: aligned spill space = 14680064 bytes +2025-08-07T13:54:02Z INFO 49164 (sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-08-07T13:54:02Z INFO 49164 (sg00) [DRAM_Allocator]: renumber locations +2025-08-07T13:54:02Z INFO 49164 (sg00) [DRAM_Allocator]: size = 4 +2025-08-07T13:54:02Z INFO 49164 []: find first defs for local +2025-08-07T13:54:02Z INFO 49164 []: find first defs for global +2025-08-07T13:54:02Z INFO 49164 (sg00) [DRAM_Allocator]: Num intervals 4 Num locations 4 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-08-07T13:54:02Z INFO 49164 (sg00) [DRAM_Allocator]: info.neighbors init Done +2025-08-07T13:54:02Z INFO 49164 (sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-08-07T13:54:02Z INFO 49164 (sg00) [DRAM_Allocator]: simplify interference graph +2025-08-07T13:54:02Z INFO 49164 (sg00) [DRAM_Allocator]: initialize low and high +2025-08-07T13:54:02Z INFO 49164 (sg00) [DRAM_Allocator]: lo = 4 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DRAM_Allocator]: hi = 0 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DRAM_Allocator]: total = 4 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DRAM_Allocator]: simplify +2025-08-07T13:54:02Z INFO 49164 (sg00) [DRAM_Allocator]: new candidates = 0 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DRAM_Allocator]: select ranges +2025-08-07T13:54:02Z INFO 49164 (sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DRAM_Allocator]: allreduce_dram_hwm 14680064 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DRAM_Allocator]: Real CC buffer size 14680064 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DRAM_Allocator]: DRAM hwm after allocation: 14680064 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DRAM_Allocator]: DRAM allocation successful +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: coloring_allocator_dram finished after 0.003 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 465mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1769 memory location(s), 1 block(s), and 3738 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running address_rotation_dram +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=1769 blocks=1 instructions=3738 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: DRAM hwm before rotation 14680064 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: allreduce hwm 14680064 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: Real CC buffer size 14680064 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: DRAM hwm after rotation 14680064 +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: address_rotation_dram finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 465mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1769 memory location(s), 1 block(s), and 3738 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running tensorcopy_accel +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=1769 blocks=1 instructions=3738 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 (sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-08-07T13:54:02Z INFO 49164 (sg00) [TensorCopyAccel::Impl]: Accelerated 0 out of 381 tensorcopy in Function: sg0000 average acceleration factor: -nan +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 465mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1769 memory location(s), 1 block(s), and 3738 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running peephole_opts +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=1769 blocks=1 instructions=3738 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 (sg00) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: peephole_opts finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 465mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1769 memory location(s), 1 block(s), and 3738 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running lower_kernel +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=1769 blocks=1 instructions=3738 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 (sg00) [LowerKernel]: Started running LowerKernel +2025-08-07T13:54:02Z INFO 49164 (sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 3738, number of allocs: 1769 +2025-08-07T13:54:02Z INFO 49164 (sg00) [LowerKernel]: Scan BKs time (s): 0.000117 +2025-08-07T13:54:02Z INFO 49164 (sg00) [LowerKernel]: Lower BKs time (s): 5e-06 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: lower_kernel finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 465mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1769 memory location(s), 1 block(s), and 3738 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=1769 blocks=1 instructions=3738 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 465mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1769 memory location(s), 1 block(s), and 3738 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=1769 blocks=1 instructions=3738 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 465mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1769 memory location(s), 1 block(s), and 3738 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=1769 blocks=1 instructions=3738 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: birverifier finished after 0.003 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 465mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1769 memory location(s), 1 block(s), and 3738 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running dynamic_dma_scan +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=1769 blocks=1 instructions=3738 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: dynamic_dma_scan finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 465mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1769 memory location(s), 1 block(s), and 3738 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running build_fdeps +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=1769 blocks=1 instructions=3738 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 (sg00) [build_flow_deps]: Start build fdeps. Invocation: 3Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 (sg00) [build_flow_deps]: Allocs: 1769 instructions: 3738 +2025-08-07T13:54:02Z INFO 49164 (sg01) [build_flow_deps]: Build fdeps inserted 28584 edges +2025-08-07T13:54:02Z INFO 49164 (sg01) [build_flow_deps]: Done build fdeps 28584 Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 (sg01) [PreSched]: End build flow dependencies Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 (sg01) [PreSched]: Start remove useless insts Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 (sg01) [PreSched]: remove_useless_insts +2025-08-07T13:54:02Z INFO 49164 (sg00) [build_flow_deps]: Build fdeps inserted 9398 edges +2025-08-07T13:54:02Z INFO 49164 (sg00) [build_flow_deps]: Done build fdeps 9398 Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: build_fdeps finished after 0.007 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 465mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1769 memory location(s), 1 block(s), and 3738 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running remove_redundancies +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=1769 blocks=1 instructions=3738 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 (sg00) [RemoveRedundancies]: remove_clobbered_writes +2025-08-07T13:54:02Z INFO 49164 (sg01) [PreSched]: remove Useless Instructions: 0 +2025-08-07T13:54:02Z INFO 49164 (sg01) [PreSched]: End remove useless insts Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 (sg01) [PreSched]: Start scratchpad optimization Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 (sg00) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-08-07T13:54:02Z INFO 49164 (sg00) [RemoveRedundancies]: remove_useless_insts +2025-08-07T13:54:02Z INFO 49164 (sg00) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: remove_redundancies finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 465mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1769 memory location(s), 1 block(s), and 3738 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:02Z INFO 49164 (sg01) [PreSched]: End scratchpad optimization Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=1769 blocks=1 instructions=3738 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 (sg00) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:02Z INFO 49164 (sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:54:02Z INFO 49164 (sg00) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:02Z INFO 49164 (sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49164 (sg01) [PreSched]: DONE PRE scheduling Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: pre_sched finished after 0.071 seconds +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 465mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2193 memory location(s), 1 block(s), and 9982 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=2193 blocks=1 instructions=9982 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z INFO 49164 (sg01) [TensorCopyElim]: Tensor CP elimination: 0 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.021 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 466mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1769 memory location(s), 1 block(s), and 3738 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=1769 blocks=1 instructions=3738 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 (sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-08-07T13:54:02Z INFO 49164 (sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.004 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 466mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1769 memory location(s), 1 block(s), and 3738 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=1769 blocks=1 instructions=3738 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 466mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1769 memory location(s), 1 block(s), and 3738 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running post_sched +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=1769 blocks=1 instructions=3738 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 [post_scheduler]: Start PosT ScheD 3 sunda Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 (sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49164 (sg01) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49164 (sg01) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49164 (sg01) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.028 seconds +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 467mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2193 memory location(s), 1 block(s), and 9982 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: Running dynamic_dma_setup +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=2193 blocks=1 instructions=9982 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 467mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2194 memory location(s), 1 block(s), and 9982 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: Running runtime_memory_reservation +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=2194 blocks=1 instructions=9982 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 467mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2194 memory location(s), 1 block(s), and 9982 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: Running coloring_allocator_psum +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=2194 blocks=1 instructions=9982 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z INFO 49164 (sg01) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:02Z INFO 49164 (sg01) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:02Z INFO 49164 (sg01) [PSUM_Allocator]: allocating PSUM +2025-08-07T13:54:02Z INFO 49164 (sg01) [PSUM_Allocator]: main loop +2025-08-07T13:54:02Z INFO 49164 (sg01) [PSUM_Allocator]: renumber locations +2025-08-07T13:54:02Z INFO 49164 (sg01) [PSUM_Allocator]: size = 498 +2025-08-07T13:54:02Z INFO 49164 (sg01) [PSUM_Allocator]: build_no_bitmap start +2025-08-07T13:54:02Z INFO 49164 (sg01) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-08-07T13:54:02Z INFO 49164 (sg01) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-08-07T13:54:02Z INFO 49164 (sg01) [PSUM_Allocator]: found 1124 edges +2025-08-07T13:54:02Z INFO 49164 (sg01) [PSUM_Allocator]: mean: 4.51406 +2025-08-07T13:54:02Z INFO 49164 (sg01) [PSUM_Allocator]: median: 4.66235 +2025-08-07T13:54:02Z INFO 49164 (sg01) [PSUM_Allocator]: adjacency vectors require 8992 bytes +2025-08-07T13:54:02Z INFO 49164 (sg01) [PSUM_Allocator]: build_no_bitmap done +2025-08-07T13:54:02Z INFO 49164 (sg01) [PSUM_Allocator]: find costs +2025-08-07T13:54:02Z INFO 49164 (sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49164 [post_scheduler]: Time-aware hwm post-sched +2025-08-07T13:54:02Z INFO 49164 (sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49164 (sg01) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-08-07T13:54:02Z INFO 49164 (sg01) [PSUM_Allocator]: simplify interference graph +2025-08-07T13:54:02Z INFO 49164 (sg01) [PSUM_Allocator]: initialize low and high +2025-08-07T13:54:02Z INFO 49164 (sg01) [PSUM_Allocator]: lo = 498 +2025-08-07T13:54:02Z INFO 49164 (sg01) [PSUM_Allocator]: hi = 0 +2025-08-07T13:54:02Z INFO 49164 (sg01) [PSUM_Allocator]: inf = 0 +2025-08-07T13:54:02Z INFO 49164 (sg01) [PSUM_Allocator]: total = 498 +2025-08-07T13:54:02Z INFO 49164 (sg01) [PSUM_Allocator]: simplify +2025-08-07T13:54:02Z INFO 49164 (sg01) [PSUM_Allocator]: new candidates = 0 +2025-08-07T13:54:02Z INFO 49164 (sg01) [PSUM_Allocator]: select ranges +2025-08-07T13:54:02Z INFO 49164 (sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49164 (sg01) [PSUM_Allocator]: no more spills +2025-08-07T13:54:02Z INFO 49164 (sg01) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-08-07T13:54:02Z INFO 49164 (sg01) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-08-07T13:54:02Z INFO 49164 (sg01) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: coloring_allocator_psum finished after 0.040 seconds +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 468mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2194 memory location(s), 1 block(s), and 9982 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: Running dma_optimization_psum +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=2194 blocks=1 instructions=9982 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z INFO 49164 [post_scheduler]: Time-aware simulation time: 1083521 +2025-08-07T13:54:02Z INFO 49164 (sg02) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-08-07T13:54:02Z INFO 49164 (sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-08-07T13:54:02Z INFO 49164 (sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: dma_optimization_psum finished after 0.008 seconds +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 468mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2194 memory location(s), 1 block(s), and 9982 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: Running address_rotation_psum +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=2194 blocks=1 instructions=9982 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z INFO 49164 [post_scheduler]: Done PosT ScheD Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: post_sched finished after 0.065 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 468mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1769 memory location(s), 1 block(s), and 3738 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running expand_scheduling_units +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=1769 blocks=1 instructions=3738 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: expand_scheduling_units finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 468mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1769 memory location(s), 1 block(s), and 3738 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=1769 blocks=1 instructions=3738 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 20 PSUM Banks +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 115 PSUM Banks +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 111 PSUM Banks +2025-08-07T13:54:02Z INFO 49164 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 10 PSUM Banks +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 92 PSUM Banks +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: SB Rotation rotated 12 Sb address +2025-08-07T13:54:02Z INFO 49164 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 22 PSUM Banks +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: address_rotation_psum finished after 0.036 seconds +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 468mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2194 memory location(s), 1 block(s), and 9982 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z USER 49164 (sg01) [ModuleForkPass]: Running coloring_allocator_sb +2025-08-07T13:54:02Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=2194 blocks=1 instructions=9982 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:02Z INFO 49164 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 202179072 +2025-08-07T13:54:02Z INFO 49164 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 846 bytes +2025-08-07T13:54:02Z INFO 49164 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 12582914 +2025-08-07T13:54:02Z INFO 49164 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 2457 bytes +2025-08-07T13:54:02Z INFO 49164 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 1064960 +2025-08-07T13:54:02Z INFO 49164 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-08-07T13:54:02Z INFO 49164 (sg01) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:02Z INFO 49164 (sg01) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: SB Rotation rotated 35 Sb address +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: SB Rotation rotated 6 Sb address +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: allocating SB +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: main loop +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: renumber locations +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: size = 1663 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: find partners +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: found 456 accumulation groups +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: largest = _dot.6-t1033_i24 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: tensors = 96 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: requires 98304 bytes/partition +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: expanding partners +2025-08-07T13:54:02Z INFO 49164 (sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: SB Rotation rotated 188 Sb address +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:02Z INFO 49164 (sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: address_rotation_sb finished after 0.058 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 471mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1769 memory location(s), 1 block(s), and 3738 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=1769 blocks=1 instructions=3738 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 (sg00) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:02Z INFO 49164 (sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:54:02Z INFO 49164 (sg00) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:02Z INFO 49164 []: find first defs for local +2025-08-07T13:54:02Z INFO 49164 []: find first defs for global +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: find loads +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: 1 pin count +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: 257 remat count +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: build interference graph +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: pass 1 int-tree +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.018 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 472mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1769 memory location(s), 1 block(s), and 3738 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=1769 blocks=1 instructions=3738 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 (sg00) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:02Z INFO 49164 (sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-08-07T13:54:02Z INFO 49164 (sg00) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: Num intervals 1663 Num locations 1663 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: IntervalTree Build Done +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: info.neighbors init Done +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.002 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 472mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1769 memory location(s), 1 block(s), and 3738 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running dep_opt +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=1769 blocks=1 instructions=3738 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 (sg00) [build_flow_deps]: Start build fdeps. Invocation: 4Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 (sg00) [build_flow_deps]: Allocs: 1769 instructions: 3738 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: info.neighbors partners Done +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: IntervalTree readback Done +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: edge: 98436 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: mean: 118.384 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: median: 86.9526 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: find costs +2025-08-07T13:54:02Z INFO 49164 (sg00) [build_flow_deps]: Build fdeps inserted 9167 edges +2025-08-07T13:54:02Z INFO 49164 (sg00) [build_flow_deps]: Done build fdeps 9167 Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: simplify interference graph +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: initialize safe & unsafe +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: safe = 472 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: unsafe = 590 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: inf = 600 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: total = 1662 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: simplify +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 549 #Pinned 0 #Safe 0 minCost 0.00386939 maxCost 0.0655748 locations 1663 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: dep_opt finished after 0.011 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1769 memory location(s), 1 block(s), and 3738 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: Running report_stats +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=1769 blocks=1 instructions=3738 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 (sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 5 │ 2489321472 │ +│ DMACopy │ Internal -> ExternalOutput │ 32 │ 33554432 │ +│ DMACopy │ Internal -> Output │ 1 │ 8388608 │ +│ Load │ Const -> Internal │ 3 │ 49408 │ +│ Load │ ExternalInput -> Internal │ 116 │ 41953792 │ +│ Load │ Internal │ 48 │ 6291456 │ +│ Save │ Internal │ 32 │ 6291456 │ +│ Save │ Internal -> Output │ 12 │ 4718594 │ +└─────────────┴────────────────────────────┴───────┴────────────┘ + +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: new candidates = 228 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: select ranges +2025-08-07T13:54:02Z INFO 49164 (sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 3 │ +│ 64 │ 1 │ +│ 128 │ 1 │ +│ 256 │ 2 │ +│ 512 │ 1 │ +│ 1024 │ 66 │ +│ 2048 │ 82 │ +│ 4096 │ 60 │ +│ 262144 │ 32 │ +│ 4194304 │ 2 │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:02Z INFO 49164 (sg00) [ReportStats]: MM Stats: #MatMults 1976 #MatMult-Transposes 512 +2025-08-07T13:54:02Z INFO 49164 (sg00) [ReportStats]: IO Tensor size combined: 668480004 +2025-08-07T13:54:02Z INFO 49164 (sg00) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input76 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input77 │ ExternalInput │ bfloat16 │ 16777216 │ +│ input83 │ ExternalInput │ bfloat16 │ 16777216 │ +│ input78 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input81 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input5 │ ExternalInput │ bfloat16 │ 1048576 │ +│ input4 │ ExternalInput │ bfloat16 │ 1048576 │ +│ output1 │ ExternalOutput │ bfloat16 │ 1048576 │ +│ output2 │ ExternalOutput │ bfloat16 │ 1048576 │ +│ input79 │ ExternalInput │ bfloat16 │ 8192 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-08-07T13:54:02Z INFO 49164 (sg00) [ReportStats]: Large (Internal) Tensor Statistics: +┌────────────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────────────────┼──────────┼──────────┼──────────────┤ +│ intermediate4-buffer-2027 │ Internal │ bfloat16 │ 4194304 │ +│ intermediate1 │ Output │ bfloat16 │ 4194304 │ +│ dot.4-buffer-2025 │ Internal │ bfloat16 │ 4194304 │ +│ all_gather.1 │ Internal │ bfloat16 │ 4194304 │ +│ intermediate4 │ Output │ bfloat16 │ 4194304 │ +│ input78_local_1225_i1 │ Internal │ bfloat16 │ 2097152 │ +│ input78_local_1225_i0 │ Internal │ bfloat16 │ 2097152 │ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ transpose.1 │ Internal │ bfloat16 │ 2097152 │ +│ t541_pftranspose_1103-t1137_i3 │ Internal │ bfloat16 │ 524288 │ +└────────────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-08-07T13:54:02Z USER 49164 (sg00) [ModuleForkPass]: report_stats finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1769 memory location(s), 1 block(s), and 3738 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: Total: 1662 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: Spilled: 0.006 (10) +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: Allocated: 0.994 (1652) +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: Rover zone: 0.663 (1096) +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: Pre-rover zone: 0.008 (14) +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: Post-rover zone: 0.328 (542) +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: Blocks tall: 1.000 (1652) +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: Visited until tall blocking (mean): 0.999 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: Success +2025-08-07T13:54:02Z INFO 49164 (sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: SB spills = 10 tensors +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: size = 10240 bytes/partition +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: remats = 0 tensors +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: unpinned = 0 tensors +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: SB score = 35620 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: best SB heuristic = 0 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: collect spills +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: insert spills +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: deleting loads #loadsToDelete: 0 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: deleting locs #locationsToDelete: 0 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: locationsToDelete done +2025-08-07T13:54:02Z INFO 49164 (sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: main loop +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: renumber locations +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: size = 1673 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: find partners +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: found 456 accumulation groups +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: largest = _dot.6-t1033_i24 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: tensors = 96 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: requires 98304 bytes/partition +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: expanding partners +2025-08-07T13:54:02Z INFO 49164 (sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:02Z USER 49164 (sg02) [ModuleForkPass]: constant_propagate finished after 0.269 seconds +2025-08-07T13:54:02Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11038 memory location(s), 1 block(s), and 54605 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z USER 49164 (sg02) [ModuleForkPass]: Running lower_ac +2025-08-07T13:54:02Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=11038 blocks=1 instructions=54605 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z INFO 49164 []: find first defs for local +2025-08-07T13:54:02Z INFO 49164 []: find first defs for global +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: find loads +2025-08-07T13:54:02Z INFO 49164 (sg02) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-08-07T13:54:02Z USER 49164 (sg02) [ModuleForkPass]: lower_ac finished after 0.014 seconds +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: 1 pin count +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: 267 remat count +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: build interference graph +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: pass 1 int-tree +2025-08-07T13:54:02Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11038 memory location(s), 1 block(s), and 54605 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z USER 49164 (sg02) [ModuleForkPass]: Running input_dma_coalescing +2025-08-07T13:54:02Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=11038 blocks=1 instructions=54605 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: Num intervals 1673 Num locations 1673 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: IntervalTree Build Done +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: info.neighbors init Done +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: info.neighbors partners Done +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: IntervalTree readback Done +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: edge: 97320 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: mean: 116.342 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: median: 80.7019 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: find costs +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: simplify interference graph +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: initialize safe & unsafe +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: safe = 3 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: unsafe = 0 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: inf = 17 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: total = 20 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: simplify +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 1673 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: new candidates = 0 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: (including 17 infinite cost tensors) +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: select ranges +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: Total: 20 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: Spilled: 0.000 (0) +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: Allocated: 1.000 (20) +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: Rover zone: 0.500 (10) +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: Pre-rover zone: 0.000 (0) +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: Post-rover zone: 0.500 (10) +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: Blocks tall: 1.000 (20) +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: Success +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: SB spills = 0 tensors +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: remats = 0 tensors +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: unpinned = 0 tensors +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:54:02Z INFO 49164 (sg01) [SB_Allocator]: SB score = 0 +2025-08-07T13:54:03Z INFO 49164 (sg01) [SB_Allocator]: spilling from SB cost about 35620 cycles +2025-08-07T13:54:03Z INFO 49164 (sg01) [SB_Allocator]: number of tensors spilled from SB = 10 +2025-08-07T13:54:03Z INFO 49164 (sg01) [SB_Allocator]: total size of spilled tensors = 10240 bytes/partition +2025-08-07T13:54:03Z INFO 49164 (sg01) [SB_Allocator]: 16384 bytes/partition (100%) successfully pinned +2025-08-07T13:54:03Z INFO 49164 (sg01) [SB_Allocator]: pinning saved approximately 9010 cycles +2025-08-07T13:54:03Z INFO 49164 (sg01) [SB_Allocator]: 0% SB utilization after allocation +2025-08-07T13:54:03Z INFO 49164 (sg02) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-08-07T13:54:03Z USER 49164 (sg02) [ModuleForkPass]: input_dma_coalescing finished after 0.030 seconds +2025-08-07T13:54:03Z INFO 49164 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 203489792 +2025-08-07T13:54:03Z INFO 49164 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 846 bytes +2025-08-07T13:54:03Z INFO 49164 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 13893634 +2025-08-07T13:54:03Z INFO 49164 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2170 bytes +2025-08-07T13:54:03Z INFO 49164 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 1064960 +2025-08-07T13:54:03Z INFO 49164 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-08-07T13:54:03Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11038 memory location(s), 1 block(s), and 54605 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:03Z USER 49164 (sg02) [ModuleForkPass]: Running remat_optimization +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: coloring_allocator_sb finished after 0.139 seconds +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2214 memory location(s), 1 block(s), and 10002 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=11038 blocks=1 instructions=54605 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2214 blocks=1 instructions=10002 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: address_rotation_sb finished after 0.012 seconds +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2214 memory location(s), 1 block(s), and 10002 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: Running dma_optimization_sb +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=2214 blocks=1 instructions=10002 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 217383426, 89.1468% input load, 1.92945% output write, 8.92371% spill/reload [sg0001] +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: removed 0 identical load +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: sub-graph will get execute 35 times +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(1.9379e+08) +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 20 SpillSaves and Reloads +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: average loaded DMA size 849 bytes +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: average saved DMA size 2411 bytes +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 1 combined 8 SpillSaves and Reloads +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: average loaded DMA size 850 bytes +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: average saved DMA size 2523 bytes +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 2 combined 0 SpillSaves and Reloads +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: average loaded DMA size 850 bytes +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: average saved DMA size 2523 bytes +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 203489792 +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 850 bytes +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 13893634 +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 2523 bytes +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 0, 0% out of total dma traffic +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 217383426, 89.1468% input load, 1.92945% output write, 8.92371% spill/reload [sg0001] +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 203489792 +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 850 bytes +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 13893634 +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 2523 bytes +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 1064960 +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 130 bytes +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 863 bytes +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: dma_optimization_sb finished after 0.062 seconds +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2192 memory location(s), 1 block(s), and 9988 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2192 blocks=1 instructions=9988 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: SB Rotation rotated 13 Sb address +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: SB Rotation rotated 219 Sb address +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: SB Rotation rotated 12 Sb address +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: SB Rotation rotated 84 Sb address +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: address_rotation_sb finished after 0.039 seconds +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2192 memory location(s), 1 block(s), and 9988 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: Running coloring_allocator_dram +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=2192 blocks=1 instructions=9988 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z INFO 49164 (sg01) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:03Z INFO 49164 (sg01) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:03Z INFO 49164 (sg02) [RematOpt]: Removed 0 remat instructions +2025-08-07T13:54:03Z USER 49164 (sg02) [ModuleForkPass]: remat_optimization finished after 0.114 seconds +2025-08-07T13:54:03Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11038 memory location(s), 1 block(s), and 54605 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:03Z USER 49164 (sg02) [ModuleForkPass]: Running early_peephole_opts +2025-08-07T13:54:03Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to early_peephole_opts: modules=1 functions=1 allocs=11038 blocks=1 instructions=54605 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:03Z INFO 49164 (sg02) [EarlyPeepholeOpts]: PeepholeOpts enabled? ActivationAccumulate: true +2025-08-07T13:54:03Z INFO 49164 (sg01) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-08-07T13:54:03Z INFO 49164 (sg01) [DRAM_Allocator]: reserved space = 214499848 bytes +2025-08-07T13:54:03Z INFO 49164 (sg01) [DRAM_Allocator]: spill space = 22282240 bytes +2025-08-07T13:54:03Z INFO 49164 (sg01) [DRAM_Allocator]: aligned spill space = 22282240 bytes +2025-08-07T13:54:03Z INFO 49164 (sg01) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-08-07T13:54:03Z INFO 49164 (sg01) [DRAM_Allocator]: renumber locations +2025-08-07T13:54:03Z INFO 49164 (sg01) [DRAM_Allocator]: size = 8 +2025-08-07T13:54:03Z INFO 49164 []: find first defs for local +2025-08-07T13:54:03Z INFO 49164 []: find first defs for global +2025-08-07T13:54:03Z INFO 49164 (sg01) [DRAM_Allocator]: Num intervals 8 Num locations 8 +2025-08-07T13:54:03Z INFO 49164 (sg01) [DRAM_Allocator]: IntervalTree Build Done +2025-08-07T13:54:03Z INFO 49164 (sg01) [DRAM_Allocator]: info.neighbors init Done +2025-08-07T13:54:03Z INFO 49164 (sg01) [DRAM_Allocator]: IntervalTree readback Done +2025-08-07T13:54:03Z INFO 49164 (sg01) [DRAM_Allocator]: simplify interference graph +2025-08-07T13:54:03Z INFO 49164 (sg01) [DRAM_Allocator]: initialize low and high +2025-08-07T13:54:03Z INFO 49164 (sg01) [DRAM_Allocator]: lo = 8 +2025-08-07T13:54:03Z INFO 49164 (sg01) [DRAM_Allocator]: hi = 0 +2025-08-07T13:54:03Z INFO 49164 (sg01) [DRAM_Allocator]: total = 8 +2025-08-07T13:54:03Z INFO 49164 (sg01) [DRAM_Allocator]: simplify +2025-08-07T13:54:03Z INFO 49164 (sg01) [DRAM_Allocator]: new candidates = 0 +2025-08-07T13:54:03Z INFO 49164 (sg01) [DRAM_Allocator]: select ranges +2025-08-07T13:54:03Z INFO 49164 (sg01) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-08-07T13:54:03Z INFO 49164 (sg01) [DRAM_Allocator]: allreduce_dram_hwm 16777216 +2025-08-07T13:54:03Z INFO 49164 (sg01) [DRAM_Allocator]: Real CC buffer size 16777216 +2025-08-07T13:54:03Z INFO 49164 (sg01) [DRAM_Allocator]: DRAM hwm after allocation: 20971520 +2025-08-07T13:54:03Z INFO 49164 (sg01) [DRAM_Allocator]: DRAM allocation successful +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: coloring_allocator_dram finished after 0.012 seconds +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2192 memory location(s), 1 block(s), and 9988 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: Running address_rotation_dram +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=2192 blocks=1 instructions=9988 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: Runtime page size at 512MB +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: DRAM hwm before rotation 20971520 +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: allreduce hwm 16777216 +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: Real CC buffer size 16777216 +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: DRAM hwm after rotation 20971520 +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: address_rotation_dram finished after 0.004 seconds +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2192 memory location(s), 1 block(s), and 9988 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: Running tensorcopy_accel +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=2192 blocks=1 instructions=9988 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z INFO 49164 (sg01) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-08-07T13:54:03Z INFO 49164 (sg01) [TensorCopyAccel::Impl]: Accelerated 16 out of 352 tensorcopy in Function: sg0001 average acceleration factor: 1 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: tensorcopy_accel finished after 0.000 seconds +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2192 memory location(s), 1 block(s), and 9988 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: Running peephole_opts +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=2192 blocks=1 instructions=9988 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z INFO 49164 (sg01) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-08-07T13:54:03Z INFO 49164 (sg02) [EarlyPeepholeOpts]: Activation Accumulate: 0 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: peephole_opts finished after 0.003 seconds +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2192 memory location(s), 1 block(s), and 10052 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: Running lower_kernel +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=2192 blocks=1 instructions=10052 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z INFO 49164 (sg01) [LowerKernel]: Started running LowerKernel +2025-08-07T13:54:03Z INFO 49164 (sg01) [LowerKernel]: Start of kernel lowering pass, number of insts: 10052, number of allocs: 2192 +2025-08-07T13:54:03Z INFO 49164 (sg01) [LowerKernel]: Scan BKs time (s): 0.000397 +2025-08-07T13:54:03Z INFO 49164 (sg01) [LowerKernel]: Lower BKs time (s): 5e-06 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: lower_kernel finished after 0.000 seconds +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2192 memory location(s), 1 block(s), and 10052 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: Running lower_nki_kernel +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=2192 blocks=1 instructions=10052 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2192 memory location(s), 1 block(s), and 10052 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=2192 blocks=1 instructions=10052 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.001 seconds +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2192 memory location(s), 1 block(s), and 10052 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=2192 blocks=1 instructions=10052 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: birverifier finished after 0.006 seconds +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2192 memory location(s), 1 block(s), and 10052 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: Running dynamic_dma_scan +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=2192 blocks=1 instructions=10052 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: dynamic_dma_scan finished after 0.001 seconds +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2192 memory location(s), 1 block(s), and 10052 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: Running build_fdeps +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=2192 blocks=1 instructions=10052 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z INFO 49164 (sg01) [build_flow_deps]: Start build fdeps. Invocation: 5Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z USER 49164 (sg02) [ModuleForkPass]: early_peephole_opts finished after 0.027 seconds +2025-08-07T13:54:03Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg01) [build_flow_deps]: Allocs: 2192 instructions: 10052 +2025-08-07T13:54:03Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11038 memory location(s), 1 block(s), and 54605 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:03Z USER 49164 (sg02) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-08-07T13:54:03Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=11038 blocks=1 instructions=54605 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:03Z USER 49164 (sg02) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.007 seconds +2025-08-07T13:54:03Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11038 memory location(s), 1 block(s), and 54605 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:03Z USER 49164 (sg02) [ModuleForkPass]: Running infer_stream_ids +2025-08-07T13:54:03Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=11038 blocks=1 instructions=54605 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:03Z USER 49164 (sg02) [ModuleForkPass]: infer_stream_ids finished after 0.007 seconds +2025-08-07T13:54:03Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11038 memory location(s), 1 block(s), and 54605 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:03Z USER 49164 (sg02) [ModuleForkPass]: Running pre_sched +2025-08-07T13:54:03Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=11038 blocks=1 instructions=54605 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:03Z INFO 49164 (sg02) [PreSched]: Start PRE scheduling 2 cores: 1 at: Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49164 [LayerSpiller]: LayerSpill: Start... +2025-08-07T13:54:03Z INFO 49164 (sg01) [build_flow_deps]: Build fdeps inserted 28725 edges +2025-08-07T13:54:03Z INFO 49164 (sg01) [build_flow_deps]: Done build fdeps 28725 Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: build_fdeps finished after 0.020 seconds +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2192 memory location(s), 1 block(s), and 10052 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: Running remove_redundancies +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=2192 blocks=1 instructions=10052 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z INFO 49164 (sg01) [RemoveRedundancies]: remove_clobbered_writes +2025-08-07T13:54:03Z INFO 49164 (sg01) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-08-07T13:54:03Z INFO 49164 (sg01) [RemoveRedundancies]: remove_useless_insts +2025-08-07T13:54:03Z INFO 49164 (sg01) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: remove_redundancies finished after 0.003 seconds +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2192 memory location(s), 1 block(s), and 10052 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2192 blocks=1 instructions=10052 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z INFO 49164 (sg01) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:03Z INFO 49164 (sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:54:03Z INFO 49164 (sg01) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:03Z INFO 49164 [LayerSpiller]: LayerSpill: Found 1 Splits CCs +2025-08-07T13:54:03Z INFO 49164 [LayerSpiller]: Grouped CCs to 1 clusters. +2025-08-07T13:54:03Z INFO 49164 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-08-07T13:54:03Z INFO 49164 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-08-07T13:54:03Z INFO 49164 [LayerSpiller]: LayerSpill: Done. +2025-08-07T13:54:03Z INFO 49164 (sg02) [PreSched]: Start split live ranges Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49164 (sg02) [PreSched]: Num_Splits: 0 +2025-08-07T13:54:03Z INFO 49164 (sg02) [PreSched]: End split live ranges Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49164 (sg02) [PreSched]: Strt remove redundncies Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49164 (sg02) [PreSched]: remove_redundant_memsets +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.067 seconds +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 479mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2192 memory location(s), 1 block(s), and 10052 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=2192 blocks=1 instructions=10052 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z INFO 49164 (sg02) [PreSched]: remove_redundant_memsets: 0 +2025-08-07T13:54:03Z INFO 49164 (sg02) [PreSched]: remove_redundant_loads +2025-08-07T13:54:03Z INFO 49164 (sg01) [TensorCopyElim]: Tensor CP elimination: 0 +2025-08-07T13:54:03Z INFO 49164 (sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:03Z INFO 49164 (sg02) [PreSched]: remove_redundant_loads: 0 +2025-08-07T13:54:03Z INFO 49164 (sg02) [PreSched]: End remove redundncies Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49164 (sg02) [PreSched]: Start DCE Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.027 seconds +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 479mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2192 memory location(s), 1 block(s), and 10052 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=2192 blocks=1 instructions=10052 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 479mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2192 memory location(s), 1 block(s), and 10052 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: Running post_sched +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=2192 blocks=1 instructions=10052 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z INFO 49164 [post_scheduler]: Start PosT ScheD 3 sunda Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49164 (sg02) [PreSched]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:03Z INFO 49164 (sg02) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49164 (sg02) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49164 (sg02) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49164 (sg02) [PreSched]: End DCE Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49164 [post_scheduler]: Time-aware hwm post-sched +2025-08-07T13:54:03Z INFO 49164 (sg02) [PreSched]: Start build flow dependencies Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49164 (sg02) [build_flow_deps]: Start build fdeps. Invocation: 6Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49164 (sg02) [build_flow_deps]: Allocs: 11038 instructions: 54605 +2025-08-07T13:54:03Z INFO 49164 [post_scheduler]: Time-aware simulation time: 93217495 +2025-08-07T13:54:03Z INFO 49164 [post_scheduler]: Done PosT ScheD Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: post_sched finished after 0.246 seconds +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 488mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2192 memory location(s), 1 block(s), and 10052 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: Running expand_scheduling_units +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=2192 blocks=1 instructions=10052 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: expand_scheduling_units finished after 0.001 seconds +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 488mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2192 memory location(s), 1 block(s), and 10052 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2192 blocks=1 instructions=10052 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 178 PSUM Banks +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 173 PSUM Banks +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 18 PSUM Banks +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: SB Rotation rotated 10 Sb address +2025-08-07T13:54:03Z INFO 49164 (sg02) [build_flow_deps]: Build fdeps inserted 186325 edges +2025-08-07T13:54:03Z INFO 49164 (sg02) [build_flow_deps]: Done build fdeps 186325 Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49164 (sg02) [PreSched]: End build flow dependencies Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49164 (sg02) [PreSched]: Start remove useless insts Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49164 (sg02) [PreSched]: remove_useless_insts +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: SB Rotation rotated 307 Sb address +2025-08-07T13:54:03Z INFO 49164 (sg02) [PreSched]: remove Useless Instructions: 0 +2025-08-07T13:54:03Z INFO 49164 (sg02) [PreSched]: End remove useless insts Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49164 (sg02) [PreSched]: Start scratchpad optimization Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: SB Rotation rotated 17 Sb address +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: SB Rotation rotated 11 Sb address +2025-08-07T13:54:03Z INFO 49164 (sg02) [PreSched]: End scratchpad optimization Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: SB Rotation rotated 129 Sb address +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:03Z INFO 49164 (sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: address_rotation_sb finished after 0.170 seconds +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 488mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2192 memory location(s), 1 block(s), and 10052 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2192 blocks=1 instructions=10052 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z INFO 49164 (sg01) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:03Z INFO 49164 (sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:54:03Z INFO 49164 (sg01) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.045 seconds +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 488mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2192 memory location(s), 1 block(s), and 10052 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2192 blocks=1 instructions=10052 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z INFO 49164 (sg01) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:03Z INFO 49164 (sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-08-07T13:54:03Z INFO 49164 (sg01) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.009 seconds +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 488mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2192 memory location(s), 1 block(s), and 10052 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: Running dep_opt +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=2192 blocks=1 instructions=10052 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z INFO 49164 (sg01) [build_flow_deps]: Start build fdeps. Invocation: 7Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49164 (sg01) [build_flow_deps]: Allocs: 2192 instructions: 10052 +2025-08-07T13:54:03Z INFO 49164 (sg02) [PreSched]: DONE PRE scheduling Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z USER 49164 (sg02) [ModuleForkPass]: pre_sched finished after 0.580 seconds +2025-08-07T13:54:03Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 488mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11038 memory location(s), 1 block(s), and 54605 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:03Z USER 49164 (sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:54:03Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=11038 blocks=1 instructions=54605 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:03Z INFO 49164 (sg01) [build_flow_deps]: Build fdeps inserted 28527 edges +2025-08-07T13:54:03Z INFO 49164 (sg01) [build_flow_deps]: Done build fdeps 28527 Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: dep_opt finished after 0.034 seconds +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 488mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2192 memory location(s), 1 block(s), and 10052 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: Running report_stats +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=2192 blocks=1 instructions=10052 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z INFO 49164 (sg01) [ReportStats]: Data Movement Statistics: sg0001 +┌─────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 12582912 │ +│ DMACopy │ Internal -> ExternalOutput │ 32 │ 33554432 │ +│ DMACopy │ Internal -> Output │ 1 │ 8388608 │ +│ Load │ Const -> Internal │ 2 │ 49152 │ +│ Load │ ExternalInput -> Internal │ 1844 │ 192954880 │ +│ Load │ Input -> Internal │ 5 │ 786432 │ +│ Load │ Internal │ 19 │ 9699328 │ +│ Save │ Internal │ 35 │ 9699328 │ +│ Save │ Internal -> Output │ 9 │ 4194306 │ +└─────────────┴────────────────────────────┴───────┴───────────┘ + +2025-08-07T13:54:03Z INFO 49164 (sg01) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 3 │ +│ 64 │ 2 │ +│ 128 │ 1 │ +│ 256 │ 1537 │ +│ 1024 │ 4 │ +│ 2048 │ 99 │ +│ 4096 │ 268 │ +│ 262144 │ 32 │ +│ 4194304 │ 5 │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:03Z INFO 49164 (sg01) [ReportStats]: MM Stats: #MatMults 6780 #MatMult-Transposes 680 +2025-08-07T13:54:03Z INFO 49164 (sg01) [ReportStats]: IO Tensor size combined: 197149188 +2025-08-07T13:54:03Z INFO 49164 (sg01) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input87 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input84 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input85 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input88 │ ExternalInput │ bfloat16 │ 16777216 │ +│ input94 │ ExternalInput │ bfloat16 │ 16777216 │ +│ input92 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input89 │ ExternalInput │ bfloat16 │ 4194304 │ +│ output4 │ ExternalOutput │ bfloat16 │ 1048576 │ +│ input7 │ ExternalInput │ bfloat16 │ 1048576 │ +│ input6 │ ExternalInput │ bfloat16 │ 1048576 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-08-07T13:54:03Z INFO 49164 (sg01) [ReportStats]: Large (Internal) Tensor Statistics: +┌───────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├───────────────────────────┼──────────┼──────────┼──────────────┤ +│ intermediate1 │ Input │ bfloat16 │ 4194304 │ +│ add.4 │ Internal │ bfloat16 │ 4194304 │ +│ dot.7-buffer-1676 │ Internal │ bfloat16 │ 4194304 │ +│ intermediate6 │ Output │ bfloat16 │ 4194304 │ +│ intermediate4 │ Input │ bfloat16 │ 4194304 │ +│ intermediate7 │ Output │ bfloat16 │ 4194304 │ +│ all_reduce.1-buffer-1678 │ Internal │ bfloat16 │ 4194304 │ +│ intermediate7-buffer-1683 │ Internal │ bfloat16 │ 4194304 │ +│ dot.11-buffer-1681 │ Internal │ bfloat16 │ 4194304 │ +│ input89_local_981_i0 │ Internal │ bfloat16 │ 2097152 │ +└───────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-08-07T13:54:03Z USER 49164 (sg01) [ModuleForkPass]: report_stats finished after 0.003 seconds +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 488mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2192 memory location(s), 1 block(s), and 10052 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:03Z INFO 49164 (sg02) [TensorCopyElim]: Tensor CP elimination: 1 +2025-08-07T13:54:03Z INFO 49164 (sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:03Z INFO 49164 (sg02) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49164 (sg02) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49164 (sg02) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z USER 49164 (sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.213 seconds +2025-08-07T13:54:03Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 488mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11037 memory location(s), 1 block(s), and 54604 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:03Z USER 49164 (sg02) [ModuleForkPass]: Running dynamic_dma_setup +2025-08-07T13:54:03Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=11037 blocks=1 instructions=54604 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:03Z USER 49164 (sg02) [ModuleForkPass]: dynamic_dma_setup finished after 0.001 seconds +2025-08-07T13:54:03Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 488mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11038 memory location(s), 1 block(s), and 54604 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:03Z USER 49164 (sg02) [ModuleForkPass]: Running runtime_memory_reservation +2025-08-07T13:54:03Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=11038 blocks=1 instructions=54604 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:03Z USER 49164 (sg02) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-08-07T13:54:03Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 488mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11038 memory location(s), 1 block(s), and 54604 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:03Z USER 49164 (sg02) [ModuleForkPass]: Running coloring_allocator_psum +2025-08-07T13:54:03Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=11038 blocks=1 instructions=54604 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:03Z INFO 49164 (sg02) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:03Z INFO 49164 (sg02) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: allocating PSUM +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: main loop +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: renumber locations +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: size = 6171 +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: build_no_bitmap start +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: 112% PSUM demand before spilling +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: PSUM high-water mark = 9 tensors +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: found 17034 edges +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: mean: 5.52066 +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: median: 6.99747 +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: adjacency vectors require 136272 bytes +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: build_no_bitmap done +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: find costs +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: simplify interference graph +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: initialize low and high +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: lo = 6154 +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: hi = 15 +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: inf = 2 +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: total = 6171 +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: simplify +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: new candidates = 1 +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: select ranges +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: PSUM spills = 1 tensors +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: PSUM score = 874 (lower is better) +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: best PSUM heuristic = 0 +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: collect spills +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: insert spills +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: main loop +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: renumber locations +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: size = 6171 +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: build_no_bitmap start +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: found 17018 edges +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: mean: 5.51548 +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: median: 6.99662 +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: adjacency vectors require 136144 bytes +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: build_no_bitmap done +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: find costs +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: simplify interference graph +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: initialize low and high +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: lo = 6171 +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: hi = 0 +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: inf = 0 +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: total = 6171 +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: simplify +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: new candidates = 0 +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: select ranges +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: no more spills +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: spilling from PSUM cost about 874 cycles +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: number of tensors spilled from PSUM = 1 +2025-08-07T13:54:04Z INFO 49164 (sg02) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-08-07T13:54:04Z USER 49164 (sg02) [ModuleForkPass]: coloring_allocator_psum finished after 0.546 seconds +2025-08-07T13:54:04Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 489mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11039 memory location(s), 1 block(s), and 54605 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:04Z USER 49164 (sg02) [ModuleForkPass]: Running dma_optimization_psum +2025-08-07T13:54:04Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=11039 blocks=1 instructions=54605 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:04Z INFO 49164 (sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-08-07T13:54:04Z INFO 49164 (sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-08-07T13:54:04Z USER 49164 (sg02) [ModuleForkPass]: dma_optimization_psum finished after 0.054 seconds +2025-08-07T13:54:04Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 489mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11039 memory location(s), 1 block(s), and 54605 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:04Z USER 49164 (sg02) [ModuleForkPass]: Running address_rotation_psum +2025-08-07T13:54:04Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=11039 blocks=1 instructions=54605 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:04Z INFO 49164 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2025-08-07T13:54:04Z INFO 49164 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 2 PSUM Banks +2025-08-07T13:54:04Z INFO 49164 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 1 PSUM Banks +2025-08-07T13:54:04Z USER 49164 (sg02) [ModuleForkPass]: address_rotation_psum finished after 0.416 seconds +2025-08-07T13:54:04Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 490mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11039 memory location(s), 1 block(s), and 54605 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:04Z USER 49164 (sg02) [ModuleForkPass]: Running coloring_allocator_sb +2025-08-07T13:54:04Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=11039 blocks=1 instructions=54605 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:05Z INFO 49164 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 782085918 +2025-08-07T13:54:05Z INFO 49164 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 2080 bytes +2025-08-07T13:54:05Z INFO 49164 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 8705802 +2025-08-07T13:54:05Z INFO 49164 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 2212 bytes +2025-08-07T13:54:05Z INFO 49164 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 8196 +2025-08-07T13:54:05Z INFO 49164 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 248 bytes +2025-08-07T13:54:05Z INFO 49164 (sg02) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:05Z INFO 49164 (sg02) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: allocating SB +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: main loop +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: renumber locations +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: size = 4830 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: find partners +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: found 6167 accumulation groups +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: largest = _dot.256-t856_i28 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: tensors = 96 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: requires 98304 bytes/partition +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: expanding partners +2025-08-07T13:54:05Z INFO 49164 []: find first defs for local +2025-08-07T13:54:05Z INFO 49164 []: find first defs for global +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: find loads +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: 1 pin count +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: 1412 remat count +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: build interference graph +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: pass 1 int-tree +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Num intervals 4830 Num locations 4830 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: IntervalTree Build Done +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: info.neighbors init Done +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: info.neighbors partners Done +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: IntervalTree readback Done +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: edge: 89632 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: mean: 37.1147 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: median: 30.5638 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: find costs +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: simplify interference graph +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: initialize safe & unsafe +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: safe = 3832 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: unsafe = 244 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: inf = 753 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: total = 4829 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: simplify +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 236 #Pinned 0 #Safe 0 minCost 0.00392174 maxCost 0.865684 locations 4830 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: new candidates = 99 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: select ranges +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Total: 4829 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Spilled: 0.003 (13) +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Allocated: 0.997 (4816) +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Rover zone: 0.890 (4287) +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Pre-rover zone: 0.005 (24) +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Post-rover zone: 0.104 (501) +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Slice zone: 0.001 (4) +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Blocks nothing: 0.042 (203) +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Blocks medium: 0.002 (9) +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Visited until medium blocking (mean): 0.715 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Visited until medium blocking (median): 0.790 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Visited until medium blocking (p95): 0.818 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Blocks tall: 0.956 (4604) +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Visited until tall blocking (mean): 0.847 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Visited until tall blocking (median): 0.999 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Success +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: SB spills = 13 tensors +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: size = 13312 bytes/partition +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: remats = 0 tensors +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: unpinned = 0 tensors +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: SB score = 46306 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: best SB heuristic = 0 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: collect spills +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: insert spills +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: deleting loads #loadsToDelete: 0 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: deleting locs #locationsToDelete: 0 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: locationsToDelete done +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: main loop +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: renumber locations +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: size = 4843 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: find partners +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: found 6167 accumulation groups +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: largest = _dot.256-t856_i28 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: tensors = 96 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: requires 98304 bytes/partition +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: expanding partners +2025-08-07T13:54:05Z INFO 49164 []: find first defs for local +2025-08-07T13:54:05Z INFO 49164 []: find first defs for global +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: find loads +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: 1 pin count +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: 1425 remat count +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: build interference graph +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: pass 1 int-tree +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Num intervals 4843 Num locations 4843 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: IntervalTree Build Done +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: info.neighbors init Done +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: info.neighbors partners Done +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: IntervalTree readback Done +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: edge: 88300 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: mean: 36.465 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: median: 31.4367 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: find costs +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: simplify interference graph +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: initialize safe & unsafe +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: safe = 1 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: unsafe = 0 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: inf = 25 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: total = 26 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: simplify +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 4843 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: new candidates = 0 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: (including 25 infinite cost tensors) +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: select ranges +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Total: 26 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Spilled: 0.000 (0) +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Allocated: 1.000 (26) +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Rover zone: 0.500 (13) +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Pre-rover zone: 0.000 (0) +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Post-rover zone: 0.500 (13) +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Slice zone: 0.000 (0) +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Blocks tall: 1.000 (26) +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-08-07T13:54:05Z INFO 49164 (sg02) [SB_Allocator]: Success +2025-08-07T13:54:29Z INFO 49164 (sg02) [SB_Allocator]: SB spills = 0 tensors +2025-08-07T13:54:29Z INFO 49164 (sg02) [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:54:29Z INFO 49164 (sg02) [SB_Allocator]: remats = 0 tensors +2025-08-07T13:54:29Z INFO 49164 (sg02) [SB_Allocator]: unpinned = 0 tensors +2025-08-07T13:54:29Z INFO 49164 (sg02) [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:54:29Z INFO 49164 (sg02) [SB_Allocator]: SB score = 0 +2025-08-07T13:54:29Z INFO 49164 (sg02) [SB_Allocator]: spilling from SB cost about 46306 cycles +2025-08-07T13:54:29Z INFO 49164 (sg02) [SB_Allocator]: number of tensors spilled from SB = 13 +2025-08-07T13:54:29Z INFO 49164 (sg02) [SB_Allocator]: total size of spilled tensors = 13312 bytes/partition +2025-08-07T13:54:29Z INFO 49164 (sg02) [SB_Allocator]: 16384 bytes/partition (100%) successfully pinned +2025-08-07T13:54:29Z INFO 49164 (sg02) [SB_Allocator]: pinning saved approximately 9010 cycles +2025-08-07T13:54:29Z INFO 49164 (sg02) [SB_Allocator]: 0% SB utilization after allocation +2025-08-07T13:54:30Z INFO 49164 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 783789854 +2025-08-07T13:54:30Z INFO 49164 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 2075 bytes +2025-08-07T13:54:30Z INFO 49164 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 10409738 +2025-08-07T13:54:30Z INFO 49164 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 1859 bytes +2025-08-07T13:54:30Z INFO 49164 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 8196 +2025-08-07T13:54:30Z INFO 49164 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 248 bytes +2025-08-07T13:54:30Z USER 49164 (sg02) [ModuleForkPass]: coloring_allocator_sb finished after 25.045 seconds +2025-08-07T13:54:30Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 495mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:30Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11065 memory location(s), 1 block(s), and 54631 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:30Z USER 49164 (sg02) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:30Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=11065 blocks=1 instructions=54631 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:30Z USER 49164 (sg02) [ModuleForkPass]: address_rotation_sb finished after 0.092 seconds +2025-08-07T13:54:30Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 495mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:30Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11065 memory location(s), 1 block(s), and 54631 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:30Z USER 49164 (sg02) [ModuleForkPass]: Running dma_optimization_sb +2025-08-07T13:54:30Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=11065 blocks=1 instructions=54631 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 794199592, 97.3783% input load, 5.03652e-07% output write, 2.6217% spill/reload [sg0002] +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: removed 0 identical load +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(7.73378e+08) +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 6 spill/reload instructions +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 6 spill/reload memory locations +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload instructions +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload memory locations +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 4100, 0.0196912% out of total spill/reload dma traffic +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 24 SpillSaves and Reloads +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: average loaded DMA size 2079 bytes +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: average saved DMA size 2155 bytes +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 1 combined 12 SpillSaves and Reloads +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: average loaded DMA size 2081 bytes +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: average saved DMA size 2341 bytes +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 2 combined 0 SpillSaves and Reloads +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: average loaded DMA size 2081 bytes +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: average saved DMA size 2341 bytes +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 783787804 +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 2081 bytes +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 10407688 +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 2341 bytes +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 4100, 0.000516243% out of total dma traffic +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 794195492, 97.3788% input load, 5.03654e-07% output write, 2.6212% spill/reload [sg0002] +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 783787804 +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 2081 bytes +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 10407688 +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 2341 bytes +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 8196 +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 248 bytes +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 2083 bytes +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-08-07T13:54:30Z USER 49164 (sg02) [ModuleForkPass]: dma_optimization_sb finished after 0.421 seconds +2025-08-07T13:54:30Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 501mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:30Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11031 memory location(s), 1 block(s), and 54607 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:30Z USER 49164 (sg02) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:30Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=11031 blocks=1 instructions=54607 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: SB Rotation rotated 309 Sb address +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: SB Rotation rotated 581 Sb address +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: SB Rotation rotated 189 Sb address +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: SB Rotation rotated 7 Sb address +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: SB Rotation rotated 272 Sb address +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:30Z USER 49164 (sg02) [ModuleForkPass]: address_rotation_sb finished after 0.296 seconds +2025-08-07T13:54:30Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 501mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:30Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11031 memory location(s), 1 block(s), and 54607 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:30Z USER 49164 (sg02) [ModuleForkPass]: Running coloring_allocator_dram +2025-08-07T13:54:30Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=11031 blocks=1 instructions=54607 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:30Z INFO 49164 (sg02) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:30Z INFO 49164 (sg02) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:30Z INFO 49164 (sg02) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-08-07T13:54:30Z INFO 49164 (sg02) [DRAM_Allocator]: reserved space = 781766682 bytes +2025-08-07T13:54:30Z INFO 49164 (sg02) [DRAM_Allocator]: spill space = 18800388 bytes +2025-08-07T13:54:30Z INFO 49164 (sg02) [DRAM_Allocator]: aligned spill space = 18841600 bytes +2025-08-07T13:54:30Z INFO 49164 (sg02) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-08-07T13:54:30Z INFO 49164 (sg02) [DRAM_Allocator]: renumber locations +2025-08-07T13:54:30Z INFO 49164 (sg02) [DRAM_Allocator]: size = 22 +2025-08-07T13:54:30Z INFO 49164 []: find first defs for local +2025-08-07T13:54:30Z INFO 49164 []: find first defs for global +2025-08-07T13:54:30Z INFO 49164 (sg02) [DRAM_Allocator]: Num intervals 22 Num locations 22 +2025-08-07T13:54:30Z INFO 49164 (sg02) [DRAM_Allocator]: IntervalTree Build Done +2025-08-07T13:54:30Z INFO 49164 (sg02) [DRAM_Allocator]: info.neighbors init Done +2025-08-07T13:54:30Z INFO 49164 (sg02) [DRAM_Allocator]: IntervalTree readback Done +2025-08-07T13:54:30Z INFO 49164 (sg02) [DRAM_Allocator]: simplify interference graph +2025-08-07T13:54:30Z INFO 49164 (sg02) [DRAM_Allocator]: initialize low and high +2025-08-07T13:54:30Z INFO 49164 (sg02) [DRAM_Allocator]: lo = 22 +2025-08-07T13:54:30Z INFO 49164 (sg02) [DRAM_Allocator]: hi = 0 +2025-08-07T13:54:30Z INFO 49164 (sg02) [DRAM_Allocator]: total = 22 +2025-08-07T13:54:30Z INFO 49164 (sg02) [DRAM_Allocator]: simplify +2025-08-07T13:54:30Z INFO 49164 (sg02) [DRAM_Allocator]: new candidates = 0 +2025-08-07T13:54:30Z INFO 49164 (sg02) [DRAM_Allocator]: select ranges +2025-08-07T13:54:30Z INFO 49164 (sg02) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-08-07T13:54:30Z INFO 49164 (sg02) [DRAM_Allocator]: allreduce_dram_hwm 8404992 +2025-08-07T13:54:30Z INFO 49164 (sg02) [DRAM_Allocator]: Real CC buffer size 8404992 +2025-08-07T13:54:30Z INFO 49164 (sg02) [DRAM_Allocator]: DRAM hwm after allocation: 12599296 +2025-08-07T13:54:30Z INFO 49164 (sg02) [DRAM_Allocator]: DRAM allocation successful +2025-08-07T13:54:30Z USER 49164 (sg02) [ModuleForkPass]: coloring_allocator_dram finished after 0.084 seconds +2025-08-07T13:54:30Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 502mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:30Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11031 memory location(s), 1 block(s), and 54607 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:30Z USER 49164 (sg02) [ModuleForkPass]: Running address_rotation_dram +2025-08-07T13:54:30Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=11031 blocks=1 instructions=54607 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: Runtime page size at 512MB +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: DRAM hwm before rotation 12599296 +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: allreduce hwm 8404992 +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: Real CC buffer size 8404992 +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: DRAM hwm after rotation 12599296 +2025-08-07T13:54:30Z INFO 49164 (sg02) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-08-07T13:54:30Z USER 49164 (sg02) [ModuleForkPass]: address_rotation_dram finished after 0.038 seconds +2025-08-07T13:54:30Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 502mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:30Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11031 memory location(s), 1 block(s), and 54607 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:30Z USER 49164 (sg02) [ModuleForkPass]: Running tensorcopy_accel +2025-08-07T13:54:30Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=11031 blocks=1 instructions=54607 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:30Z INFO 49164 (sg02) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-08-07T13:54:30Z INFO 49164 (sg02) [TensorCopyAccel::Impl]: Accelerated 0 out of 6133 tensorcopy in Function: sg0002 average acceleration factor: -nan +2025-08-07T13:54:30Z USER 49164 (sg02) [ModuleForkPass]: tensorcopy_accel finished after 0.005 seconds +2025-08-07T13:54:30Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 502mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:30Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11031 memory location(s), 1 block(s), and 54607 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:30Z USER 49164 (sg02) [ModuleForkPass]: Running peephole_opts +2025-08-07T13:54:30Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=11031 blocks=1 instructions=54607 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:30Z INFO 49164 (sg02) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-08-07T13:54:30Z USER 49164 (sg02) [ModuleForkPass]: peephole_opts finished after 0.016 seconds +2025-08-07T13:54:30Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 502mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:30Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11031 memory location(s), 1 block(s), and 54610 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:30Z USER 49164 (sg02) [ModuleForkPass]: Running lower_kernel +2025-08-07T13:54:30Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=11031 blocks=1 instructions=54610 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:30Z INFO 49164 (sg02) [LowerKernel]: Started running LowerKernel +2025-08-07T13:54:30Z INFO 49164 (sg02) [LowerKernel]: Start of kernel lowering pass, number of insts: 54610, number of allocs: 11031 +2025-08-07T13:54:31Z INFO 49164 (sg02) [LowerKernel]: Scan BKs time (s): 0.003475 +2025-08-07T13:54:31Z INFO 49164 (sg02) [LowerKernel]: Lower BKs time (s): 9e-06 +2025-08-07T13:54:31Z USER 49164 (sg02) [ModuleForkPass]: lower_kernel finished after 0.004 seconds +2025-08-07T13:54:31Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 502mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:31Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11031 memory location(s), 1 block(s), and 54610 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:31Z USER 49164 (sg02) [ModuleForkPass]: Running lower_nki_kernel +2025-08-07T13:54:31Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=11031 blocks=1 instructions=54610 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:31Z USER 49164 (sg02) [ModuleForkPass]: lower_nki_kernel finished after 0.004 seconds +2025-08-07T13:54:31Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 502mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:31Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11031 memory location(s), 1 block(s), and 54610 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:31Z USER 49164 (sg02) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-08-07T13:54:31Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=11031 blocks=1 instructions=54610 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:31Z USER 49164 (sg02) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.007 seconds +2025-08-07T13:54:31Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 502mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:31Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11031 memory location(s), 1 block(s), and 54610 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:31Z USER 49164 (sg02) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:31Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=11031 blocks=1 instructions=54610 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:31Z USER 49164 (sg02) [ModuleForkPass]: birverifier finished after 0.045 seconds +2025-08-07T13:54:31Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 502mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:31Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11031 memory location(s), 1 block(s), and 54610 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:31Z USER 49164 (sg02) [ModuleForkPass]: Running dynamic_dma_scan +2025-08-07T13:54:31Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=11031 blocks=1 instructions=54610 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:31Z USER 49164 (sg02) [ModuleForkPass]: dynamic_dma_scan finished after 0.007 seconds +2025-08-07T13:54:31Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 502mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:31Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11031 memory location(s), 1 block(s), and 54610 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:31Z USER 49164 (sg02) [ModuleForkPass]: Running build_fdeps +2025-08-07T13:54:31Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=11031 blocks=1 instructions=54610 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:31Z INFO 49164 (sg02) [build_flow_deps]: Start build fdeps. Invocation: 8Thu Aug 7 13:54:31 2025 +2025-08-07T13:54:31Z INFO 49164 (sg02) [build_flow_deps]: Allocs: 11031 instructions: 54610 +2025-08-07T13:54:31Z INFO 49164 (sg02) [build_flow_deps]: Build fdeps inserted 186340 edges +2025-08-07T13:54:31Z INFO 49164 (sg02) [build_flow_deps]: Done build fdeps 186340 Thu Aug 7 13:54:31 2025 +2025-08-07T13:54:31Z USER 49164 (sg02) [ModuleForkPass]: build_fdeps finished after 0.158 seconds +2025-08-07T13:54:31Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 502mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:31Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11031 memory location(s), 1 block(s), and 54610 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:31Z USER 49164 (sg02) [ModuleForkPass]: Running remove_redundancies +2025-08-07T13:54:31Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=11031 blocks=1 instructions=54610 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:31Z INFO 49164 (sg02) [RemoveRedundancies]: remove_clobbered_writes +2025-08-07T13:54:31Z INFO 49164 (sg02) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-08-07T13:54:31Z INFO 49164 (sg02) [RemoveRedundancies]: remove_useless_insts +2025-08-07T13:54:31Z INFO 49164 (sg02) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-08-07T13:54:31Z USER 49164 (sg02) [ModuleForkPass]: remove_redundancies finished after 0.023 seconds +2025-08-07T13:54:31Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 502mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:31Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11031 memory location(s), 1 block(s), and 54610 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:31Z USER 49164 (sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:31Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=11031 blocks=1 instructions=54610 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:31Z INFO 49164 (sg02) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:31Z INFO 49164 (sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:54:31Z INFO 49164 (sg02) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:31Z USER 49164 (sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.268 seconds +2025-08-07T13:54:31Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 534mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:31Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11031 memory location(s), 1 block(s), and 54610 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:31Z USER 49164 (sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:54:31Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=11031 blocks=1 instructions=54610 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:31Z INFO 49164 (sg02) [TensorCopyElim]: Tensor CP elimination: 0 +2025-08-07T13:54:31Z INFO 49164 (sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:31Z USER 49164 (sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.073 seconds +2025-08-07T13:54:31Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 535mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:31Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11031 memory location(s), 1 block(s), and 54610 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:31Z USER 49164 (sg02) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-08-07T13:54:31Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=11031 blocks=1 instructions=54610 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:31Z USER 49164 (sg02) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-08-07T13:54:31Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 535mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:31Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11031 memory location(s), 1 block(s), and 54610 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:31Z USER 49164 (sg02) [ModuleForkPass]: Running post_sched +2025-08-07T13:54:31Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=11031 blocks=1 instructions=54610 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:31Z INFO 49164 [post_scheduler]: Start PosT ScheD 3 sunda Thu Aug 7 13:54:31 2025 +2025-08-07T13:54:32Z INFO 49164 [post_scheduler]: Time-aware hwm post-sched +2025-08-07T13:54:33Z INFO 49164 [post_scheduler]: Time-aware simulation time: 7164944 +2025-08-07T13:54:33Z INFO 49164 [post_scheduler]: Done PosT ScheD Thu Aug 7 13:54:33 2025 +2025-08-07T13:54:33Z USER 49164 (sg02) [ModuleForkPass]: post_sched finished after 1.700 seconds +2025-08-07T13:54:33Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 574mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11031 memory location(s), 1 block(s), and 54610 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:33Z USER 49164 (sg02) [ModuleForkPass]: Running expand_scheduling_units +2025-08-07T13:54:33Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=11031 blocks=1 instructions=54610 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:33Z USER 49164 (sg02) [ModuleForkPass]: expand_scheduling_units finished after 0.006 seconds +2025-08-07T13:54:33Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 546mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:33Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11031 memory location(s), 1 block(s), and 54610 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:33Z USER 49164 (sg02) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:33Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=11031 blocks=1 instructions=54610 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:33Z INFO 49164 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 3115 PSUM Banks +2025-08-07T13:54:33Z INFO 49164 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 5027 PSUM Banks +2025-08-07T13:54:33Z INFO 49164 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 2 PSUM Banks +2025-08-07T13:54:33Z INFO 49164 (sg02) [DMAOptimizationBase]: SB Rotation rotated 15 Sb address +2025-08-07T13:54:33Z INFO 49164 (sg02) [DMAOptimizationBase]: SB Rotation rotated 262 Sb address +2025-08-07T13:54:34Z INFO 49164 (sg02) [DMAOptimizationBase]: SB Rotation rotated 25 Sb address +2025-08-07T13:54:34Z INFO 49164 (sg02) [DMAOptimizationBase]: SB Rotation rotated 13 Sb address +2025-08-07T13:54:34Z INFO 49164 (sg02) [DMAOptimizationBase]: SB Rotation rotated 120 Sb address +2025-08-07T13:54:34Z INFO 49164 (sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:34Z INFO 49164 (sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:34Z USER 49164 (sg02) [ModuleForkPass]: address_rotation_sb finished after 0.965 seconds +2025-08-07T13:54:34Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 550mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11031 memory location(s), 1 block(s), and 54610 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z USER 49164 (sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:34Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=11031 blocks=1 instructions=54610 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z INFO 49164 (sg02) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:34Z INFO 49164 (sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:54:34Z INFO 49164 (sg02) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:34Z USER 49164 (sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.170 seconds +2025-08-07T13:54:34Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 565mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11031 memory location(s), 1 block(s), and 54610 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z USER 49164 (sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:34Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=11031 blocks=1 instructions=54610 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z INFO 49164 (sg02) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:34Z INFO 49164 (sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-08-07T13:54:34Z INFO 49164 (sg02) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:34Z USER 49164 (sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.036 seconds +2025-08-07T13:54:34Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 536mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11031 memory location(s), 1 block(s), and 54610 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z USER 49164 (sg02) [ModuleForkPass]: Running dep_opt +2025-08-07T13:54:34Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=11031 blocks=1 instructions=54610 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z INFO 49164 (sg02) [build_flow_deps]: Start build fdeps. Invocation: 9Thu Aug 7 13:54:34 2025 +2025-08-07T13:54:34Z INFO 49164 (sg02) [build_flow_deps]: Allocs: 11031 instructions: 54610 +2025-08-07T13:54:34Z INFO 49164 (sg02) [build_flow_deps]: Build fdeps inserted 182805 edges +2025-08-07T13:54:34Z INFO 49164 (sg02) [build_flow_deps]: Done build fdeps 182805 Thu Aug 7 13:54:34 2025 +2025-08-07T13:54:34Z USER 49164 (sg02) [ModuleForkPass]: dep_opt finished after 0.212 seconds +2025-08-07T13:54:34Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 540mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11031 memory location(s), 1 block(s), and 54610 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z USER 49164 (sg02) [ModuleForkPass]: Running report_stats +2025-08-07T13:54:34Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=11031 blocks=1 instructions=54610 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z INFO 49164 (sg02) [ReportStats]: Data Movement Statistics: sg0002 +┌─────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 12582912 │ +│ DMACopy │ Internal │ 1 │ 4194304 │ +│ Load │ Const -> Internal │ 4 │ 34824 │ +│ Load │ ExternalInput -> Internal │ 2922 │ 773343244 │ +│ Load │ Internal │ 37 │ 10409736 │ +│ Save │ Internal │ 633 │ 10407684 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 4 │ +└─────────────┴────────────────────────────┴───────┴───────────┘ + +2025-08-07T13:54:34Z INFO 49164 (sg02) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 1 │ +│ 4 │ 9 │ +│ 8 │ 2 │ +│ 16 │ 3 │ +│ 64 │ 2 │ +│ 256 │ 1538 │ +│ 512 │ 593 │ +│ 1024 │ 16 │ +│ 2048 │ 19 │ +│ 4096 │ 1410 │ +│ 60768 │ 1 │ +│ 60776 │ 4 │ +│ 4194304 │ 3 │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:34Z INFO 49164 (sg02) [ReportStats]: MM Stats: #MatMults 43667 #MatMult-Transposes 19987 +2025-08-07T13:54:34Z INFO 49164 (sg02) [ReportStats]: IO Tensor size combined: 773343248 +2025-08-07T13:54:34Z INFO 49164 (sg02) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input473 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input469 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input472 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input470 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input474 │ ExternalInput │ bfloat16 │ 8192 │ +│ input471 │ ExternalInput │ bfloat16 │ 8192 │ +│ input1 │ ExternalInput │ int32 │ 2048 │ +│ input3 │ ExternalInput │ float32 │ 12 │ +│ output0 │ ExternalOutput │ int32 │ 4 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-08-07T13:54:34Z INFO 49164 (sg02) [ReportStats]: Large (Internal) Tensor Statistics: +┌──────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├──────────────────────────┼──────────┼──────────┼──────────────┤ +│ all_reduce.3-buffer-2764 │ Internal │ bfloat16 │ 4194304 │ +│ dot.14-buffer-2762 │ Internal │ bfloat16 │ 4194304 │ +│ intermediate109 │ Input │ bfloat16 │ 4194304 │ +│ convert.57 │ Internal │ bfloat16 │ 4194304 │ +│ add.9 │ Internal │ bfloat16 │ 4194304 │ +│ intermediate108 │ Input │ bfloat16 │ 4194304 │ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ -t2787 │ Internal │ float32 │ 1048576 │ +│ -t2781 │ Internal │ float32 │ 1048576 │ +│ -t2776 │ Internal │ float32 │ 1048576 │ +└──────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-08-07T13:54:34Z USER 49164 (sg02) [ModuleForkPass]: report_stats finished after 0.013 seconds +2025-08-07T13:54:34Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 535mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11031 memory location(s), 1 block(s), and 54610 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z USER 49164 [ModuleForkPass]: Compilation status: Total modules: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:34Z USER 49164 [BackendPassManager]: mod_parallel_pass finished after 32.110 seconds +2025-08-07T13:54:34Z INFO 49164 [BackendPassManager]: curr_vmrss: 535mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49164 [BackendPassManager]: Output has 3 module(s), 3 function(s), 14992 memory location(s), 3 block(s), and 68400 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z USER 49164 [BackendPassManager]: Running assign_trigger_engine +2025-08-07T13:54:34Z INFO 49164 [BackendPassManager]: Inputs to assign_trigger_engine: modules=3 functions=3 allocs=14992 blocks=3 instructions=68400 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z INFO 49164 (sg00) [AssignTriggerEngine]: Assigned trigger engine for 81 DMA instructions. Moved 49 DMA instructions to CC's engines. +2025-08-07T13:54:34Z INFO 49164 (sg01) [AssignTriggerEngine]: Assigned trigger engine for 44 DMA instructions. Moved 9 DMA instructions to CC's engines. +2025-08-07T13:54:34Z INFO 49164 (sg02) [AssignTriggerEngine]: Assigned trigger engine for 644 DMA instructions. Moved 11 DMA instructions to CC's engines. +2025-08-07T13:54:34Z USER 49164 [BackendPassManager]: assign_trigger_engine finished after 0.057 seconds +2025-08-07T13:54:34Z INFO 49164 [BackendPassManager]: curr_vmrss: 537mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49164 [BackendPassManager]: Output has 3 module(s), 3 function(s), 14992 memory location(s), 3 block(s), and 68400 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z USER 49164 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:54:34Z INFO 49164 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=3 functions=3 allocs=14992 blocks=3 instructions=68400 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z USER 49164 (sg00) [SubgraphForkPass]: Running lower_local_collectives +2025-08-07T13:54:34Z INFO 49164 (sg00) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=1 functions=1 allocs=1769 blocks=1 instructions=3738 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:34Z USER 49164 (sg00) [SubgraphForkPass]: lower_local_collectives finished after 0.000 seconds +2025-08-07T13:54:34Z INFO 49164 (sg00) [SubgraphForkPass]: curr_vmrss: 537mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49164 (sg00) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 1769 memory location(s), 1 block(s), and 3738 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:34Z USER 49164 (sg01) [SubgraphForkPass]: Running lower_local_collectives +2025-08-07T13:54:34Z USER 49164 (sg00) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-08-07T13:54:34Z USER 49164 (sg02) [SubgraphForkPass]: Running lower_local_collectives +2025-08-07T13:54:34Z INFO 49164 (sg00) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=1 functions=1 allocs=1769 blocks=1 instructions=3738 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:34Z USER 49164 (sg00) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.000 seconds +2025-08-07T13:54:34Z INFO 49164 (sg01) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=1 functions=1 allocs=2192 blocks=1 instructions=10052 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:34Z INFO 49164 (sg00) [SubgraphForkPass]: curr_vmrss: 537mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z USER 49164 (sg01) [SubgraphForkPass]: lower_local_collectives finished after 0.000 seconds +2025-08-07T13:54:34Z INFO 49164 (sg01) [SubgraphForkPass]: curr_vmrss: 537mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49164 (sg00) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 1769 memory location(s), 1 block(s), and 3738 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:34Z USER 49164 (sg00) [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:54:34Z INFO 49164 (sg01) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 2192 memory location(s), 1 block(s), and 10052 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:34Z USER 49164 (sg01) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-08-07T13:54:34Z INFO 49164 (sg00) [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=1769 blocks=1 instructions=3738 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:34Z INFO 49164 (sg01) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=1 functions=1 allocs=2192 blocks=1 instructions=10052 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:34Z USER 49164 (sg01) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.000 seconds +2025-08-07T13:54:34Z INFO 49164 (sg01) [SubgraphForkPass]: curr_vmrss: 537mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49164 (sg01) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 2192 memory location(s), 1 block(s), and 10052 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:34Z USER 49164 (sg01) [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:54:34Z INFO 49164 (sg02) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=1 functions=1 allocs=11031 blocks=1 instructions=54610 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z USER 49164 (sg02) [SubgraphForkPass]: lower_local_collectives finished after 0.000 seconds +2025-08-07T13:54:34Z INFO 49164 (sg01) [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=2192 blocks=1 instructions=10052 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:34Z INFO 49164 (sg02) [SubgraphForkPass]: curr_vmrss: 537mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49164 (sg02) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 11031 memory location(s), 1 block(s), and 54610 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z USER 49164 (sg02) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-08-07T13:54:34Z INFO 49164 (sg02) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=1 functions=1 allocs=11031 blocks=1 instructions=54610 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z USER 49164 (sg02) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.000 seconds +2025-08-07T13:54:34Z INFO 49164 (sg02) [SubgraphForkPass]: curr_vmrss: 537mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49164 (sg02) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 11031 memory location(s), 1 block(s), and 54610 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z USER 49164 (sg02) [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:54:34Z INFO 49164 (sg02) [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=11031 blocks=1 instructions=54610 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z INFO 49164 (sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:34Z USER 49164 (sg00) [SubgraphForkPass]: dead_code_elim finished after 0.011 seconds +2025-08-07T13:54:34Z INFO 49164 (sg00) [SubgraphForkPass]: curr_vmrss: 537mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49164 (sg00) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 1769 memory location(s), 1 block(s), and 3738 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:34Z INFO 49164 (sg01) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:34Z INFO 49164 (sg02) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:34Z USER 49164 (sg01) [SubgraphForkPass]: dead_code_elim finished after 0.041 seconds +2025-08-07T13:54:34Z INFO 49164 (sg01) [SubgraphForkPass]: curr_vmrss: 537mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49164 (sg01) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 2192 memory location(s), 1 block(s), and 10052 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:34Z USER 49164 (sg02) [SubgraphForkPass]: dead_code_elim finished after 0.062 seconds +2025-08-07T13:54:34Z INFO 49164 (sg02) [SubgraphForkPass]: curr_vmrss: 537mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49164 (sg02) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 11031 memory location(s), 1 block(s), and 54610 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z USER 49164 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:34Z USER 49164 [BackendPassManager]: subgraph_parallel_pass finished after 0.066 seconds +2025-08-07T13:54:34Z INFO 49164 [BackendPassManager]: curr_vmrss: 537mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49164 [BackendPassManager]: Output has 3 module(s), 3 function(s), 14992 memory location(s), 3 block(s), and 68400 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z USER 49164 [BackendPassManager]: Running assign_hwdge_engine +2025-08-07T13:54:34Z INFO 49164 [BackendPassManager]: Inputs to assign_hwdge_engine: modules=3 functions=3 allocs=14992 blocks=3 instructions=68400 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z USER 49164 [BackendPassManager]: assign_hwdge_engine finished after 0.008 seconds +2025-08-07T13:54:34Z INFO 49164 [BackendPassManager]: curr_vmrss: 537mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49164 [BackendPassManager]: Output has 3 module(s), 3 function(s), 14992 memory location(s), 3 block(s), and 68400 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z USER 49164 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:54:34Z INFO 49164 [BackendPassManager]: Inputs to mod_parallel_pass: modules=3 functions=3 allocs=14992 blocks=3 instructions=68400 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z USER 49164 (sg00) [ModuleForkPass]: Running alloc_queues +2025-08-07T13:54:34Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=1769 blocks=1 instructions=3738 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:34Z INFO 49164 (sg00) [AllocQueues]: DMACopy transpose will be triggered from multiple engines +2025-08-07T13:54:34Z USER 49164 (sg01) [ModuleForkPass]: Running alloc_queues +2025-08-07T13:54:34Z USER 49164 (sg02) [ModuleForkPass]: Running alloc_queues +2025-08-07T13:54:34Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=2192 blocks=1 instructions=10052 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:34Z INFO 49164 (sg01) [AllocQueues]: DMACopy transpose will be triggered from multiple engines +2025-08-07T13:54:34Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=11031 blocks=1 instructions=54610 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z INFO 49164 (sg02) [AllocQueues]: DMACopy transpose will be triggered from multiple engines +2025-08-07T13:54:34Z INFO 49164 (sg00) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 2 │ +│ qPoolIO0 │ input │ Pool │ 16 │ 1 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 3 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 32 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 48 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 163 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-08-07T13:54:34Z USER 49164 (sg00) [ModuleForkPass]: alloc_queues finished after 0.001 seconds +2025-08-07T13:54:34Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 537mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1769 memory location(s), 1 block(s), and 3738 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:34Z USER 49164 (sg00) [ModuleForkPass]: Running chain_dma_transposes +2025-08-07T13:54:34Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=1769 blocks=1 instructions=3738 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:34Z USER 49164 (sg00) [ModuleForkPass]: chain_dma_transposes finished after 0.000 seconds +2025-08-07T13:54:34Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 537mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1769 memory location(s), 1 block(s), and 3738 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:34Z USER 49164 (sg00) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-08-07T13:54:34Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=1769 blocks=1 instructions=3738 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:34Z USER 49164 (sg00) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-08-07T13:54:34Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 537mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1769 memory location(s), 1 block(s), and 3738 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:34Z USER 49164 (sg00) [ModuleForkPass]: Running lower_control +2025-08-07T13:54:34Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=1769 blocks=1 instructions=3738 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:34Z INFO 49164 (sg01) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 1 │ +│ qPoolIO0 │ input │ Pool │ 16 │ 1 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 13 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 35 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 8 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 1890 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-08-07T13:54:34Z USER 49164 (sg01) [ModuleForkPass]: alloc_queues finished after 0.002 seconds +2025-08-07T13:54:34Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 537mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2192 memory location(s), 1 block(s), and 10052 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:34Z USER 49164 (sg01) [ModuleForkPass]: Running chain_dma_transposes +2025-08-07T13:54:34Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=2192 blocks=1 instructions=10052 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:34Z USER 49164 (sg01) [ModuleForkPass]: chain_dma_transposes finished after 0.000 seconds +2025-08-07T13:54:34Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 537mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2192 memory location(s), 1 block(s), and 10052 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:34Z USER 49164 (sg01) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-08-07T13:54:34Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=2192 blocks=1 instructions=10052 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:34Z USER 49164 (sg01) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-08-07T13:54:34Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 537mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2192 memory location(s), 1 block(s), and 10052 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:34Z USER 49164 (sg01) [ModuleForkPass]: Running lower_control +2025-08-07T13:54:34Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=2192 blocks=1 instructions=10052 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:34Z INFO 49164 (sg00) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-08-07T13:54:34Z USER 49164 (sg00) [ModuleForkPass]: lower_control finished after 0.005 seconds +2025-08-07T13:54:34Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 537mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1769 memory location(s), 1 block(s), and 3738 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:34Z USER 49164 (sg00) [ModuleForkPass]: Running dep_reduction +2025-08-07T13:54:34Z INFO 49164 (sg00) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=1769 blocks=1 instructions=3738 Max writers: 32 Max Readers: 512 +2025-08-07T13:54:34Z INFO 49164 (sg00) [DepReduction]: Start Dependency Reduction +2025-08-07T13:54:34Z INFO 49164 (sg02) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 5 │ +│ qPoolIO0 │ input │ Pool │ 16 │ 1 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 30 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 626 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 14 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 4 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 2919 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-08-07T13:54:34Z USER 49164 (sg02) [ModuleForkPass]: alloc_queues finished after 0.007 seconds +2025-08-07T13:54:34Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 537mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49164 (sg00) [DepReduction]: Processing async instrs... +2025-08-07T13:54:34Z INFO 49164 (sg00) [DepReduction]: Processing secondary edges per engine... +2025-08-07T13:54:34Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11031 memory location(s), 1 block(s), and 54610 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z USER 49164 (sg02) [ModuleForkPass]: Running chain_dma_transposes +2025-08-07T13:54:34Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=11031 blocks=1 instructions=54610 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z USER 49164 (sg02) [ModuleForkPass]: chain_dma_transposes finished after 0.001 seconds +2025-08-07T13:54:34Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 537mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11031 memory location(s), 1 block(s), and 54610 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z USER 49164 (sg02) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-08-07T13:54:34Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=11031 blocks=1 instructions=54610 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z USER 49164 (sg02) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-08-07T13:54:34Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 537mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49164 (sg00) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 3255 +2025-08-07T13:54:34Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11031 memory location(s), 1 block(s), and 54610 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z USER 49164 (sg02) [ModuleForkPass]: Running lower_control +2025-08-07T13:54:34Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=11031 blocks=1 instructions=54610 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z INFO 49164 (sg00) [DepReduction]: Processing redundant descendants, Done. Num edges removed 3426 +2025-08-07T13:54:34Z INFO 49164 (sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 3426 +2025-08-07T13:54:34Z INFO 49164 (sg01) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-08-07T13:54:34Z USER 49164 (sg01) [ModuleForkPass]: lower_control finished after 0.013 seconds +2025-08-07T13:54:34Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 537mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2192 memory location(s), 1 block(s), and 10052 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:34Z USER 49164 (sg01) [ModuleForkPass]: Running dep_reduction +2025-08-07T13:54:34Z INFO 49164 (sg01) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=2192 blocks=1 instructions=10052 Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:34Z INFO 49164 (sg01) [DepReduction]: Start Dependency Reduction +2025-08-07T13:54:34Z INFO 49164 (sg01) [DepReduction]: Processing async instrs... +2025-08-07T13:54:34Z INFO 49164 (sg01) [DepReduction]: Processing secondary edges per engine... +2025-08-07T13:54:34Z INFO 49164 (sg01) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 11343 +2025-08-07T13:54:34Z INFO 49164 (sg00) [DepReduction]: Num Async removed: 0 +2025-08-07T13:54:34Z INFO 49164 (sg00) [DepReduction]: Finished dependency reduction: 19978 removed, new total 1703 +2025-08-07T13:54:34Z INFO 49164 (sg00) [DepReduction]: Finished Dependency Reduction +2025-08-07T13:54:34Z USER 49164 (sg00) [ModuleForkPass]: dep_reduction finished after 0.026 seconds +2025-08-07T13:54:34Z INFO 49164 (sg00) [ModuleForkPass]: curr_vmrss: 537mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49164 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1769 memory location(s), 1 block(s), and 3738 instruction(s). Max writers: 32 Max Readers: 512 +2025-08-07T13:54:34Z INFO 49164 (sg01) [DepReduction]: Processing redundant descendants, Done. Num edges removed 13217 +2025-08-07T13:54:34Z INFO 49164 (sg01) [DepReduction]: Processing async instrs, Done. Num edges removed 13217 +2025-08-07T13:54:34Z INFO 49164 (sg02) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-08-07T13:54:34Z USER 49164 (sg02) [ModuleForkPass]: lower_control finished after 0.066 seconds +2025-08-07T13:54:34Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 537mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11031 memory location(s), 1 block(s), and 54610 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z USER 49164 (sg02) [ModuleForkPass]: Running dep_reduction +2025-08-07T13:54:34Z INFO 49164 (sg02) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=11031 blocks=1 instructions=54610 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:34Z INFO 49164 (sg02) [DepReduction]: Start Dependency Reduction +2025-08-07T13:54:34Z INFO 49164 (sg01) [DepReduction]: Num Async removed: 0 +2025-08-07T13:54:34Z INFO 49164 (sg01) [DepReduction]: Finished dependency reduction: 57690 removed, new total 3767 +2025-08-07T13:54:34Z INFO 49164 (sg01) [DepReduction]: Finished Dependency Reduction +2025-08-07T13:54:34Z USER 49164 (sg01) [ModuleForkPass]: dep_reduction finished after 0.076 seconds +2025-08-07T13:54:34Z INFO 49164 (sg01) [ModuleForkPass]: curr_vmrss: 538mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49164 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2192 memory location(s), 1 block(s), and 10052 instruction(s). Max writers: 48 Max Readers: 1536 +2025-08-07T13:54:34Z INFO 49164 (sg02) [DepReduction]: Processing async instrs... +2025-08-07T13:54:34Z INFO 49164 (sg02) [DepReduction]: Processing secondary edges per engine... +2025-08-07T13:54:35Z INFO 49164 (sg02) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 47937 +2025-08-07T13:54:35Z INFO 49164 (sg02) [DepReduction]: Processing redundant descendants, Done. Num edges removed 51558 +2025-08-07T13:54:35Z INFO 49164 (sg02) [DepReduction]: Processing async instrs, Done. Num edges removed 51558 +2025-08-07T13:54:35Z INFO 49164 (sg02) [DepReduction]: Num Async removed: 0 +2025-08-07T13:54:35Z INFO 49164 (sg02) [DepReduction]: Finished dependency reduction: 368768 removed, new total 17797 +2025-08-07T13:54:35Z INFO 49164 (sg02) [DepReduction]: Finished Dependency Reduction +2025-08-07T13:54:35Z USER 49164 (sg02) [ModuleForkPass]: dep_reduction finished after 0.715 seconds +2025-08-07T13:54:35Z INFO 49164 (sg02) [ModuleForkPass]: curr_vmrss: 566mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49164 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11031 memory location(s), 1 block(s), and 54610 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:35Z USER 49164 [ModuleForkPass]: Compilation status: Total modules: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:35Z USER 49164 [BackendPassManager]: mod_parallel_pass finished after 0.833 seconds +2025-08-07T13:54:35Z INFO 49164 [BackendPassManager]: curr_vmrss: 561mb, ru_maxrss: 599mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49164 [BackendPassManager]: Output has 3 module(s), 3 function(s), 14992 memory location(s), 3 block(s), and 68400 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:35Z USER 49164 [BackendPassManager]: Running nc_parallel_pass +2025-08-07T13:54:35Z INFO 49164 [BackendPassManager]: Inputs to nc_parallel_pass: modules=3 functions=3 allocs=14992 blocks=3 instructions=68400 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:35Z USER 49164 [CoreForkPass]: Running bir_linker +2025-08-07T13:54:35Z INFO 49164 [CoreForkPass]: Inputs to bir_linker: modules=3 functions=3 allocs=14992 blocks=3 instructions=68400 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:35Z INFO 49164 (sgLnk) [BirLinker]: bir_linker cwd: +2025-08-07T13:54:35Z INFO 49164 (sgLnk) [BirLinker]: Num intermediates 111 +2025-08-07T13:54:35Z INFO 49164 (sgLnk) [BirLinker]: Num Module Definitions 3 +2025-08-07T13:54:35Z INFO 49164 (sgLnk) [BirLinker]: Linking to a call-graph structure +2025-08-07T13:54:35Z INFO 49164 (sgLnk) [BirLinker]: Added a new SpillReload Que qPoolPIOParam0 +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [BirLinker]: tensor_map verification successful. +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [BirLinker]: Writing updated tensor_map /home/ubuntu/qwen3/context_encoding_model/_tp0_bk2/neuronxcc-_bze9vv6/sgLnk/sg00/tensor_map.json +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [BirLinker]: PostLink Stats: #MatMults 282943 #MatMult-Transposes 44299 +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [BirLinker]: Total Intermediate MMTs 4888 #out: 4608 #inp: 280 #symmetric: 0 +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [BirLinker]: Total Intermediate IOs with MMTs: 38 #out: 36 #inp: 2 #both: 0 +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [BirLinker]: releasing pre-link modules +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [BirLinker]: linking Done. +2025-08-07T13:54:36Z USER 49164 [CoreForkPass]: bir_linker finished after 0.813 seconds +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: curr_vmrss: 885mb, ru_maxrss: 885mb (delta=286mb) +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: Output has 1 module(s), 4 function(s), 15651 memory location(s), 4 block(s), and 68460 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:36Z USER 49164 [CoreForkPass]: Running postlnk_dma_report +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: Inputs to postlnk_dma_report: modules=1 functions=4 allocs=15651 blocks=4 instructions=68460 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [DMAReport]: DMA Report: Bytes loaded or saved 1070883624, 94.2373% input load, 0.832294% output write, 4.93041% spill/reload +2025-08-07T13:54:36Z USER 49164 [CoreForkPass]: postlnk_dma_report finished after 0.008 seconds +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: curr_vmrss: 477mb, ru_maxrss: 885mb (delta=0mb) +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: Output has 1 module(s), 4 function(s), 15651 memory location(s), 4 block(s), and 68460 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:36Z USER 49164 [CoreForkPass]: Running report_stats +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: Inputs to report_stats: modules=1 functions=4 allocs=15651 blocks=4 instructions=68460 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [ReportStats]: Data Movement Statistics: main +┌─────────────┬──────┬───────┬───────┐ +│ Instruction │ Kind │ Count │ Bytes │ +└─────────────┴──────┴───────┴───────┘ + +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 5 │ 2489321472 │ +│ DMACopy │ Internal -> ExternalOutput │ 32 │ 33554432 │ +│ DMACopy │ Internal -> Output │ 1 │ 8388608 │ +│ Load │ Const -> Internal │ 3 │ 49408 │ +│ Load │ ExternalInput -> Internal │ 116 │ 41953792 │ +│ Load │ Internal │ 48 │ 6291456 │ +│ Save │ Internal │ 32 │ 6291456 │ +│ Save │ Internal -> Output │ 12 │ 4718594 │ +└─────────────┴────────────────────────────┴───────┴────────────┘ + +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 3 │ +│ 64 │ 1 │ +│ 128 │ 1 │ +│ 256 │ 2 │ +│ 512 │ 1 │ +│ 1024 │ 66 │ +│ 2048 │ 82 │ +│ 4096 │ 60 │ +│ 262144 │ 32 │ +│ 4194304 │ 2 │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [ReportStats]: Data Movement Statistics: sg0001 +┌─────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 12582912 │ +│ DMACopy │ Internal -> ExternalOutput │ 32 │ 33554432 │ +│ DMACopy │ Internal -> Output │ 1 │ 8388608 │ +│ Load │ Const -> Internal │ 2 │ 49152 │ +│ Load │ ExternalInput -> Internal │ 1844 │ 192954880 │ +│ Load │ Input -> Internal │ 5 │ 786432 │ +│ Load │ Internal │ 19 │ 9699328 │ +│ Save │ Internal │ 35 │ 9699328 │ +│ Save │ Internal -> Output │ 9 │ 4194306 │ +└─────────────┴────────────────────────────┴───────┴───────────┘ + +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 3 │ +│ 64 │ 2 │ +│ 128 │ 1 │ +│ 256 │ 1537 │ +│ 1024 │ 4 │ +│ 2048 │ 99 │ +│ 4096 │ 268 │ +│ 262144 │ 32 │ +│ 4194304 │ 5 │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [ReportStats]: Data Movement Statistics: sg0002 +┌─────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 12582912 │ +│ DMACopy │ Internal │ 1 │ 4194304 │ +│ Load │ Const -> Internal │ 4 │ 34824 │ +│ Load │ ExternalInput -> Internal │ 2922 │ 773343244 │ +│ Load │ Internal │ 37 │ 10409736 │ +│ Save │ Internal │ 633 │ 10407684 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 4 │ +└─────────────┴────────────────────────────┴───────┴───────────┘ + +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 1 │ +│ 4 │ 9 │ +│ 8 │ 2 │ +│ 16 │ 3 │ +│ 64 │ 2 │ +│ 256 │ 1538 │ +│ 512 │ 593 │ +│ 1024 │ 16 │ +│ 2048 │ 19 │ +│ 4096 │ 1410 │ +│ 60768 │ 1 │ +│ 60776 │ 4 │ +│ 4194304 │ 3 │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [ReportStats]: MM Stats: #MatMults 52423 #MatMult-Transposes 21179 +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [ReportStats]: IO Tensor size combined: 9981015084 +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬───────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼───────────────┼──────────┼──────────────┤ +│ input76_sg0000 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input473_sg0002 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input76 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input473 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input131 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input109 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input98 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input153 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input87 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input175 │ ExternalInput │ bfloat16 │ 50331648 │ +└────────────────────┴───────────────┴──────────┴──────────────┘ + +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [ReportStats]: Large (Internal) Tensor Statistics: +┌─────────────────┬───────────────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├─────────────────┼───────────────────┼──────────┼──────────────┤ +│ intermediate1 │ InternalInterface │ bfloat16 │ 4194304 │ +│ intermediate4 │ InternalInterface │ bfloat16 │ 4194304 │ +│ intermediate18 │ InternalInterface │ bfloat16 │ 4194304 │ +│ intermediate9 │ InternalInterface │ bfloat16 │ 4194304 │ +│ intermediate15 │ InternalInterface │ bfloat16 │ 4194304 │ +│ intermediate12 │ InternalInterface │ bfloat16 │ 4194304 │ +│ intermediate27 │ InternalInterface │ bfloat16 │ 4194304 │ +│ intermediate24 │ InternalInterface │ bfloat16 │ 4194304 │ +│ intermediate21 │ InternalInterface │ bfloat16 │ 4194304 │ +│ intermediate6 │ InternalInterface │ bfloat16 │ 4194304 │ +└─────────────────┴───────────────────┴──────────┴──────────────┘ + +2025-08-07T13:54:36Z USER 49164 [CoreForkPass]: report_stats finished after 0.018 seconds +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: curr_vmrss: 477mb, ru_maxrss: 885mb (delta=0mb) +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: Output has 1 module(s), 4 function(s), 15651 memory location(s), 4 block(s), and 68460 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:36Z USER 49164 [CoreForkPass]: Running coloring_allocator_dram_post_lnk +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: Inputs to coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=15651 blocks=4 instructions=68460 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [DRAM_Allocator]: allocating spills in DRAM post_link mode for address space Local +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [DRAM_Allocator]: reserved space = 8342042644 bytes +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [DRAM_Allocator]: spill space = 302514248 bytes +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [DRAM_Allocator]: aligned spill space = 302661632 bytes +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [DRAM_Allocator]: renumber locations +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [DRAM_Allocator]: size = 111 +2025-08-07T13:54:36Z INFO 49164 []: find first defs for local +2025-08-07T13:54:36Z INFO 49164 []: find first defs for global +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [DRAM_Allocator]: Num intervals 111 Num locations 111 +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [DRAM_Allocator]: IntervalTree Build Done +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [DRAM_Allocator]: info.neighbors init Done +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [DRAM_Allocator]: IntervalTree readback Done +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [DRAM_Allocator]: simplify interference graph +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [DRAM_Allocator]: initialize low and high +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [DRAM_Allocator]: lo = 111 +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [DRAM_Allocator]: hi = 0 +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [DRAM_Allocator]: total = 111 +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [DRAM_Allocator]: simplify +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [DRAM_Allocator]: new candidates = 0 +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [DRAM_Allocator]: Already used DRAM hwm: 20971520 +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [DRAM_Allocator]: select ranges +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [DRAM_Allocator]: allreduce_dram_hwm 20971520 +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [DRAM_Allocator]: Real CC buffer size 20971520 +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [DRAM_Allocator]: DRAM hwm after allocation: 42479616 +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [DRAM_Allocator]: DRAM allocation successful +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:36Z USER 49164 [CoreForkPass]: coloring_allocator_dram_post_lnk finished after 0.047 seconds +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: curr_vmrss: 477mb, ru_maxrss: 885mb (delta=0mb) +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: Output has 1 module(s), 4 function(s), 15651 memory location(s), 4 block(s), and 68460 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:36Z USER 49164 [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_post_lnk +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=15651 blocks=4 instructions=68460 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:36Z USER 49164 [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_post_lnk finished after 0.029 seconds +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: curr_vmrss: 477mb, ru_maxrss: 885mb (delta=0mb) +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: Output has 1 module(s), 4 function(s), 15651 memory location(s), 4 block(s), and 68460 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:36Z USER 49164 [CoreForkPass]: Running lower_dynamic_dma +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=4 allocs=15651 blocks=4 instructions=68460 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:36Z USER 49164 [CoreForkPass]: lower_dynamic_dma finished after 0.010 seconds +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: curr_vmrss: 477mb, ru_maxrss: 885mb (delta=0mb) +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: Output has 1 module(s), 4 function(s), 15651 memory location(s), 4 block(s), and 68460 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:36Z USER 49164 [CoreForkPass]: Running legalize_dynamic_dma +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=4 allocs=15651 blocks=4 instructions=68460 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 1 DGE instructions +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 1 DGE instructions were scanned +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-08-07T13:54:36Z USER 49164 [CoreForkPass]: legalize_dynamic_dma finished after 0.022 seconds +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: curr_vmrss: 477mb, ru_maxrss: 885mb (delta=0mb) +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: Output has 1 module(s), 4 function(s), 15651 memory location(s), 4 block(s), and 68460 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:36Z USER 49164 [CoreForkPass]: Running lower_dma +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: Inputs to lower_dma: modules=1 functions=4 allocs=15651 blocks=4 instructions=68460 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 67892/67892 (100% DGE) + power-of-2 partition : 67930/67971 (99.9397% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 67930/67971 (99.9397% DGE) + Cast (DGE/DMA) + 128 partition : 145/145 (100% DGE) + power-of-2 partition : 145/146 (99.3151% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 145/146 (99.3151% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 0/2096 (0% DGE) + power-of-2 partition : 0/2717 (0% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/2717 (0% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/0 + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/0 + CopyMode + CCE : 36 + Transpose : 1 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 1/1 (100% DGE) + vector : 1156/1156 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-08-07T13:54:36Z USER 49164 [CoreForkPass]: lower_dma finished after 0.078 seconds +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: curr_vmrss: 477mb, ru_maxrss: 885mb (delta=0mb) +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: Output has 1 module(s), 4 function(s), 15651 memory location(s), 4 block(s), and 68460 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:36Z USER 49164 [CoreForkPass]: Running expand_all_engine +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=4 allocs=15651 blocks=4 instructions=68460 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:36Z USER 49164 [CoreForkPass]: expand_all_engine finished after 0.010 seconds +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: curr_vmrss: 477mb, ru_maxrss: 885mb (delta=0mb) +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: Output has 1 module(s), 4 function(s), 15651 memory location(s), 4 block(s), and 68460 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:36Z USER 49164 [CoreForkPass]: Running alloc_semaphores +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=4 allocs=15651 blocks=4 instructions=68460 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:36Z USER 49164 [CoreForkPass]: alloc_semaphores finished after 0.053 seconds +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: curr_vmrss: 477mb, ru_maxrss: 885mb (delta=0mb) +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: Output has 1 module(s), 4 function(s), 15651 memory location(s), 4 block(s), and 68460 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:36Z USER 49164 [CoreForkPass]: Running expand_inst_late +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=4 allocs=15651 blocks=4 instructions=68460 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:36Z USER 49164 [CoreForkPass]: expand_inst_late finished after 0.051 seconds +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: curr_vmrss: 477mb, ru_maxrss: 885mb (delta=0mb) +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: Output has 1 module(s), 4 function(s), 15651 memory location(s), 4 block(s), and 68595 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:36Z USER 49164 [CoreForkPass]: Running seq_inst_opt +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=4 allocs=15651 blocks=4 instructions=68595 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [SeqInstOpt]: Removing 97 unnecessary InstRegisterMove instruction(s) from Block1 +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [SeqInstOpt]: Removing 31 unnecessary InstRegisterMove instruction(s) from Block1 +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-08-07T13:54:36Z USER 49164 [CoreForkPass]: seq_inst_opt finished after 0.008 seconds +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: curr_vmrss: 477mb, ru_maxrss: 885mb (delta=0mb) +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: Output has 1 module(s), 4 function(s), 15651 memory location(s), 4 block(s), and 68467 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:36Z USER 49164 [CoreForkPass]: Running lower_sync +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: Inputs to lower_sync: modules=1 functions=4 allocs=15651 blocks=4 instructions=68467 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:36Z USER 49164 [CoreForkPass]: lower_sync finished after 0.030 seconds +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: curr_vmrss: 477mb, ru_maxrss: 885mb (delta=0mb) +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: Output has 1 module(s), 4 function(s), 15651 memory location(s), 4 block(s), and 73049 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:36Z USER 49164 [CoreForkPass]: Running lower_act +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: Inputs to lower_act: modules=1 functions=4 allocs=15651 blocks=4 instructions=73049 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:36Z USER 49164 [CoreForkPass]: lower_act finished after 0.009 seconds +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: curr_vmrss: 477mb, ru_maxrss: 885mb (delta=0mb) +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: Output has 1 module(s), 4 function(s), 15651 memory location(s), 4 block(s), and 73070 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:36Z USER 49164 [CoreForkPass]: Running lower_dve +2025-08-07T13:54:36Z INFO 49164 [CoreForkPass]: Inputs to lower_dve: modules=1 functions=4 allocs=15651 blocks=4 instructions=73070 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:36Z INFO 49164 (sgLnk) [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen2/dve_info.json +2025-08-07T13:54:37Z USER 49164 [CoreForkPass]: lower_dve finished after 0.076 seconds +2025-08-07T13:54:37Z INFO 49164 [CoreForkPass]: curr_vmrss: 484mb, ru_maxrss: 885mb (delta=0mb) +2025-08-07T13:54:37Z INFO 49164 [CoreForkPass]: Output has 1 module(s), 4 function(s), 15651 memory location(s), 4 block(s), and 73070 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:37Z USER 49164 [CoreForkPass]: Running lower_ap +2025-08-07T13:54:37Z INFO 49164 [CoreForkPass]: Inputs to lower_ap: modules=1 functions=4 allocs=15651 blocks=4 instructions=73070 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:37Z USER 49164 [CoreForkPass]: lower_ap finished after 0.012 seconds +2025-08-07T13:54:37Z INFO 49164 [CoreForkPass]: curr_vmrss: 484mb, ru_maxrss: 885mb (delta=0mb) +2025-08-07T13:54:37Z INFO 49164 [CoreForkPass]: Output has 1 module(s), 4 function(s), 15651 memory location(s), 4 block(s), and 73070 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:37Z USER 49164 [CoreForkPass]: Running coloring_allocator_reg +2025-08-07T13:54:37Z INFO 49164 [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=4 allocs=15651 blocks=4 instructions=73070 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: allocating REG +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: main loop iteration 1 +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: allocating REG +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: main loop iteration 1 +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: renumber registers +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: size = 3 +2025-08-07T13:54:37Z INFO 49164 []: find first defs for local reg +2025-08-07T13:54:37Z INFO 49164 []: find first defs for global reg +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: live range analysis +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: find costs +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: simplify interference graph +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: initialize low and high +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: lo = 3 +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: hi = 0 +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: inf = 0 +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: total = 3 +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: simplify +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: new candidates = 0 +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: select ranges +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: no more spills +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: allocating REG +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: main loop iteration 1 +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: renumber registers +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: size = 1 +2025-08-07T13:54:37Z INFO 49164 []: find first defs for local reg +2025-08-07T13:54:37Z INFO 49164 []: find first defs for global reg +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: live range analysis +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: find costs +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: simplify interference graph +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: initialize low and high +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: lo = 1 +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: hi = 0 +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: inf = 0 +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: total = 1 +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: simplify +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: new candidates = 0 +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: select ranges +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: no more spills +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: allocating REG +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: main loop iteration 1 +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: renumber registers +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: size = 4 +2025-08-07T13:54:37Z INFO 49164 []: find first defs for local reg +2025-08-07T13:54:37Z INFO 49164 []: find first defs for global reg +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: live range analysis +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: find costs +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: simplify interference graph +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: initialize low and high +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: lo = 4 +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: hi = 0 +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: inf = 0 +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: total = 4 +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: simplify +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: new candidates = 0 +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: select ranges +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: no more spills +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-08-07T13:54:37Z USER 49164 [CoreForkPass]: coloring_allocator_reg finished after 0.088 seconds +2025-08-07T13:54:37Z INFO 49164 [CoreForkPass]: curr_vmrss: 487mb, ru_maxrss: 885mb (delta=0mb) +2025-08-07T13:54:37Z INFO 49164 [CoreForkPass]: Output has 1 module(s), 4 function(s), 15651 memory location(s), 4 block(s), and 73070 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:37Z USER 49164 [BackendPassManager]: nc_parallel_pass finished after 1.418 seconds +2025-08-07T13:54:37Z INFO 49164 [BackendPassManager]: curr_vmrss: 487mb, ru_maxrss: 885mb (delta=286mb) +2025-08-07T13:54:37Z INFO 49164 [BackendPassManager]: Output has 1 module(s), 4 function(s), 15651 memory location(s), 4 block(s), and 73070 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:37Z USER 49164 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:54:37Z INFO 49164 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=4 allocs=15651 blocks=4 instructions=73070 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:37Z USER 49164 [ModuleForkPass]: Running birverifier +2025-08-07T13:54:37Z INFO 49164 [ModuleForkPass]: Inputs to birverifier: modules=1 functions=4 allocs=15651 blocks=4 instructions=73070 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:37Z USER 49164 [ModuleForkPass]: birverifier finished after 0.069 seconds +2025-08-07T13:54:37Z INFO 49164 [ModuleForkPass]: curr_vmrss: 492mb, ru_maxrss: 885mb (delta=0mb) +2025-08-07T13:54:37Z INFO 49164 [ModuleForkPass]: Output has 1 module(s), 4 function(s), 15651 memory location(s), 4 block(s), and 73070 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:37Z USER 49164 [BackendPassManager]: mod_parallel_pass finished after 0.072 seconds +2025-08-07T13:54:37Z INFO 49164 [BackendPassManager]: curr_vmrss: 492mb, ru_maxrss: 885mb (delta=0mb) +2025-08-07T13:54:37Z INFO 49164 [BackendPassManager]: Output has 1 module(s), 4 function(s), 15651 memory location(s), 4 block(s), and 73070 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:37Z USER 49164 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:54:37Z INFO 49164 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=4 allocs=15651 blocks=4 instructions=73070 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:37Z USER 49164 [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:54:37Z INFO 49164 [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=4 allocs=15651 blocks=4 instructions=73070 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:37Z USER 49164 [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-08-07T13:54:37Z INFO 49164 [SubgraphForkPass]: curr_vmrss: 492mb, ru_maxrss: 885mb (delta=0mb) +2025-08-07T13:54:37Z INFO 49164 [SubgraphForkPass]: Output has 1 module(s), 4 function(s), 15651 memory location(s), 4 block(s), and 73070 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:37Z USER 49164 [BackendPassManager]: subgraph_parallel_pass finished after 0.002 seconds +2025-08-07T13:54:37Z INFO 49164 [BackendPassManager]: curr_vmrss: 492mb, ru_maxrss: 885mb (delta=0mb) +2025-08-07T13:54:37Z INFO 49164 [BackendPassManager]: Output has 1 module(s), 4 function(s), 15651 memory location(s), 4 block(s), and 73070 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:37Z USER 49164 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:54:37Z INFO 49164 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=4 allocs=15651 blocks=4 instructions=73070 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:37Z USER 49164 [ModuleForkPass]: Running codegen +2025-08-07T13:54:37Z INFO 49164 [ModuleForkPass]: Inputs to codegen: modules=1 functions=4 allocs=15651 blocks=4 instructions=73070 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [Codegen]: Total compiler allocated DRAM tensors: 0.0395622 GB +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [Codegen]: Total un-allocated DRAM tensors by kind: +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [Codegen]: +┌────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼─────────────┤ +│ ExternalInput │ 7.62851 │ +│ ExternalOutput │ 0.0703125 │ +│ Const │ 0.000124224 │ +└────────────────┴─────────────┘ + +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [Codegen]: Total runtime managed DRAM tensors: 7.69894 GB +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [Codegen]: Instruction Stats: +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [Codegen]: +┌─────────────────────┬───────┐ +│ Opcode │ Count │ +├─────────────────────┼───────┤ +│ MATMUL │ 52427 │ +│ LDWEIGHTS │ 52259 │ +│ ACTIVATE │ 7826 │ +│ UNKNOWN(0xd4) │ 4972 │ +│ EVENT_SEMAPHORE │ 4582 │ +│ PSEUDO_DMA_TRIGGER │ 824 │ +│ TENSOR_TENSOR │ 499 │ +│ MATCH_VALUE_LOAD │ 441 │ +│ TENSOR_SCALAR_ADDR │ 235 │ +│ MAX8 │ 224 │ +│ FIND_INDEX8 │ 224 │ +│ MATCH_REPLACE8 │ 217 │ +│ UNKNOWN(0xd3) │ 185 │ +│ TENSOR_REDUCE │ 136 │ +│ UNKNOWN(0x8b) │ 136 │ +│ UNKNOWN(0x8d) │ 128 │ +│ POOL_BUFFER_LOAD │ 99 │ +│ GATHER │ 99 │ +│ TENSOR_SCALAR │ 98 │ +│ MEMSET │ 77 │ +│ UNKNOWN(0xda) │ 76 │ +│ UNKNOWN(0xe8) │ 67 │ +│ COPY │ 66 │ +│ UNKNOWN(0x92) │ 64 │ +│ UNKNOWN(0x8a) │ 64 │ +│ CAST │ 45 │ +│ ACT_TABLE_LOAD │ 21 │ +│ PSEUDO_BRANCH_LABEL │ 20 │ +│ RECIPROCAL │ 19 │ +│ UNKNOWN(0xd2) │ 15 │ +│ PSEUDO_DMA_REARM │ 10 │ +│ UNKNOWN(0xcf) │ 10 │ +│ UNKNOWN(0xd9) │ 7 │ +│ IOTA │ 6 │ +│ MOVE │ 4 │ +│ LOAD_MASK_SELECT │ 4 │ +│ STREAM_SHUFFLE │ 4 │ +│ ALU_OP │ 2 │ +│ UNKNOWN(0xe5) │ 2 │ +│ RNG │ 1 │ +│ TENSOR_SCALAR │ 1 │ +│ PSEUDO_TENSOR_LOAD │ 1 │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [Codegen]: +┌────────────┬────────┐ +│ Engine │ Count │ +├────────────┼────────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 7620 │ +│ Scalar │ 9384 │ +│ Tensor │ 106488 │ +│ SyncDMA │ 0 │ +│ Vector │ 2600 │ +│ Sync │ 125 │ +│ All │ 0 │ +└────────────┴────────┘ + +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [Codegen]: Total instructions: 126217 (0.00752312 GB) +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [Codegen]: Total DynamicDMA instruction count: 4972 +2025-08-07T13:54:37Z USER 49164 (sgLnk) [Codegen]: isa_gen finished after 0.275 seconds +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────────────┼────────────────┤ +│ qActSpillReload0_defId_0 │ 8192 │ +│ qActSpillReload0_defId_1 │ 8960 │ +│ qActSpillReload0_defId_2 │ 8364 │ +│ qDVESpillReload0_defId_2 │ 8 │ +│ qPoolIO0 │ 2 │ +│ qPoolPIOParam0 │ 72 │ +│ qPoolSpillReload0_defId_0 │ 12288 │ +│ qPoolSpillReload0_defId_1 │ 2048 │ +│ qPoolSpillReload0_defId_2 │ 2822 │ +│ qSPIO0 │ 73874 │ +│ qSPSpillReload0_defId_0 │ 514 │ +│ qSPSpillReload0_defId_1 │ 3328 │ +│ qSPSpillReload0_defId_2 │ 3870 │ +└───────────────────────────┴────────────────┘ + +Total descriptors: 124342 (0.00185284 GB) +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qSPIO0 │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qPoolDynamic │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qPoolIO0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +│ qPoolPIOParam0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 128 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [Codegen]: Tensors with largest descriptor count: +┌────────────────────────────┬──────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├────────────────────────────┼──────────┼──────────┼──────────────────┤ +│ add.9_sg0002 │ Internal │ bfloat16 │ 9 │ +│ dot.7-buffer-1676_sg0001 │ Internal │ bfloat16 │ 16 │ +│ transpose.1_sg0000 │ Internal │ bfloat16 │ 16 │ +│ dot.14-buffer-2762_sg0002 │ Internal │ bfloat16 │ 16 │ +│ dot.11-buffer-1681_sg0001 │ Internal │ bfloat16 │ 16 │ +│ dot.4-buffer-2025_sg0000 │ Internal │ bfloat16 │ 16 │ +│ all-reduce.519.1689_sg0001 │ Internal │ bfloat16 │ 35 │ +│ add.4_sg0001 │ Internal │ bfloat16 │ 43 │ +│ all_gather.1_sg0000 │ Internal │ bfloat16 │ 48 │ +│ convert.59_sg0002 │ Internal │ float32 │ 599 │ +└────────────────────────────┴──────────┴──────────┴──────────────────┘ + +2025-08-07T13:54:37Z USER 49164 (sgLnk) [Codegen]: dma_desc_gen finished after 0.016 seconds +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [Codegen]: Estimated peak DRAM usage: 7.74788 GB +2025-08-07T13:54:37Z INFO 49164 (sgLnk) [Codegen]: Generating debug info +2025-08-07T13:54:37Z WARNING 49164 (sgLnk) [Codegen]: Found 127 instructions with more than 100 dependencies. For each such instruction, skipping writing more than 100 dependencies into the built-in NEFF debug info to prevent excessive compile time and NEFF size. For those instructions, the Neuron profiler will not display the skipped dependencies. +2025-08-07T13:54:37Z USER 49164 (sgLnk) [Codegen]: debug_info_gen finished after 0.165 seconds +2025-08-07T13:54:37Z USER 49164 [ModuleForkPass]: codegen finished after 0.477 seconds +2025-08-07T13:54:37Z INFO 49164 [ModuleForkPass]: curr_vmrss: 550mb, ru_maxrss: 885mb (delta=0mb) +2025-08-07T13:54:37Z INFO 49164 [ModuleForkPass]: Output has 1 module(s), 4 function(s), 15651 memory location(s), 4 block(s), and 73070 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:37Z USER 49164 [BackendPassManager]: mod_parallel_pass finished after 0.480 seconds +2025-08-07T13:54:37Z INFO 49164 [BackendPassManager]: curr_vmrss: 550mb, ru_maxrss: 885mb (delta=0mb) +2025-08-07T13:54:37Z INFO 49164 [BackendPassManager]: Output has 1 module(s), 4 function(s), 15651 memory location(s), 4 block(s), and 73070 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:37Z USER 49164 [BackendPassManager]: Running neff_packager +2025-08-07T13:54:37Z INFO 49164 [BackendPassManager]: Inputs to neff_packager: modules=1 functions=4 allocs=15651 blocks=4 instructions=73070 Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:37Z INFO 49164 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.9-1294_CRSM.npy +2025-08-07T13:54:37Z INFO 49164 [NeffPackager]: FileDeDuper file not found value_sg0000_identity_1476_CRSM.npy +2025-08-07T13:54:37Z INFO 49164 [NeffPackager]: FileDeDuper file not found value_sg0000_t2040_CRSM.npy +2025-08-07T13:54:37Z INFO 49164 [NeffPackager]: FileDeDuper file not found value_sg0001_identity_1171_CRSM.npy +2025-08-07T13:54:37Z INFO 49164 [NeffPackager]: FileDeDuper file not found value_sg0001_t1692_CRSM.npy +2025-08-07T13:54:37Z INFO 49164 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.24_CRSM.npy +2025-08-07T13:54:37Z INFO 49164 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.25_CRSM.npy +2025-08-07T13:54:37Z INFO 49164 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.26-814-918_CRSM.npy +2025-08-07T13:54:37Z INFO 49164 [NeffPackager]: FileDeDuper file not found value_sg0002_identity_1054_CRSM.npy +2025-08-07T13:54:37Z INFO 49164 [NeffPackager]: Const File de-dup saved 0 KB of memory footprint +2025-08-07T13:54:37Z WARNING 49164 [NeffFileWriter]: writeKelp missing file /local/p4clients/pkgbuild-const/workspace/build/KaenaCompiler/KaenaCompiler-2.x.169490.0/AL2_x86_64/DEV.STD.PTHREAD/build/private/_skbuild/linux-x86_64-3.10/cmake-build/neuronxcc/walrus/neff_packager/MetricMetadata.json +2025-08-07T13:54:37Z INFO 49164 [NeffFileWriter]: Neff will be written to: /home/ubuntu/qwen3/context_encoding_model/_tp0_bk2/model.MODULE_00594b8bc68e927f3dbe+1ad60ced.neff +2025-08-07T13:54:37Z INFO 49164 [NeffFileWriter]: IR signature: eaf53596d9329c376e57c6cded30ea00 for neff artifacts +2025-08-07T13:54:37Z USER 49164 [BackendPassManager]: neff_packager finished after 0.107 seconds +2025-08-07T13:54:37Z INFO 49164 [BackendPassManager]: curr_vmrss: 550mb, ru_maxrss: 885mb (delta=0mb) +2025-08-07T13:54:37Z INFO 49164 [BackendPassManager]: Output has 1 module(s), 4 function(s), 15651 memory location(s), 4 block(s), and 73070 instruction(s). Max writers: 594 Max Readers: 19987 +2025-08-07T13:54:37Z INFO 49164 [BackendDriver]: HBM scratchpad usage summary (post-allocation): +┌──────┬───────────┬────────────────────────────────────────────────────────────┬─────────────┐ +│ Core │ Subgraph │ Description │ Value │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc00 │ sg00 │ Peak scratchpad usage: local │ 0.013672 GB │ +│ nc00 │ sg00 │ Total size of allocated tensors: local │ 0.013672 GB │ +│ nc00 │ sg01 │ Peak scratchpad usage: local │ 0.019531 GB │ +│ nc00 │ sg01 │ Total size of allocated tensors: local │ 0.020752 GB │ +│ nc00 │ sg02 │ Peak scratchpad usage: local │ 0.011734 GB │ +│ nc00 │ sg02 │ Total size of allocated tensors: local │ 0.017548 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local │ 0.019531 GB │ +│ nc00 │ Post-link │ Peak scratchpad usage after intermediate tensor allocation │ 0.039562 GB │ +│ nc00 │ Post-link │ Total size of allocated intermediate tensors │ 0.281876 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ Max │ Max │ Peak scratchpad usage │ 0.039562 GB │ +│ Max │ Max │ Peak scratchpad usage (page-aligned) │ 0.500000 GB │ +└──────┴───────────┴────────────────────────────────────────────────────────────┴─────────────┘ + +2025-08-07T13:54:37Z INFO 49164 [BackendDriver]: Backend completed successfully, tearing down. +2025-08-07T13:54:37Z INFO 47910 [job.WalrusDriver.0]: new_lnkState: {"model": ["/home/ubuntu/qwen3/context_encoding_model/_tp0_bk2/model.MODULE_00594b8bc68e927f3dbe+1ad60ced.hlo_module.pb"], "tensormap": "tensor_map.json", "bir": "walrus_bir.out.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "cached_wavegraph": "walrus_bir.out.json", "state_dir": "/home/ubuntu/qwen3/context_encoding_model/_tp0_bk2/neuronxcc-_bze9vv6/sgLnk/sg00", "state_id": "sgLnk"} +2025-08-07T13:54:37Z INFO 47910 [job.WalrusDriver.0]: MTBackend: completed successfully. +2025-08-07T13:54:37Z INFO 47910 [pipeline.Pipeline.0]: Finished job job.WalrusDriver.0 +2025-08-07T13:54:37Z INFO 47910 [pipeline.Pipeline.0]: Starting job job.BIRLinker.0 +2025-08-07T13:54:37Z INFO 47910 [job.BIRLinker.0]: Replay this job by calling: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/bin/neuronx-cc compile --framework XLA --state '{"model": ["/home/ubuntu/qwen3/context_encoding_model/_tp0_bk2/model.MODULE_00594b8bc68e927f3dbe+1ad60ced.hlo_module.pb"], "tensormap": "tensor_map.json", "bir": "walrus_bir.out.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "cached_wavegraph": "walrus_bir.out.json", "state_dir": "/home/ubuntu/qwen3/context_encoding_model/_tp0_bk2/neuronxcc-_bze9vv6/sgLnk/sg00", "state_id": "sgLnk"}' --pipeline BIRLinker +2025-08-07T13:54:37Z INFO 47910 [job.BIRLinker.0]: BIRLinker cwd: /home/ubuntu/qwen3/context_encoding_model/_tp0_bk2/neuronxcc-_bze9vv6 +2025-08-07T13:54:37Z INFO 47910 [job.BIRLinker.0]: Linking already done. +2025-08-07T13:54:37Z INFO 47910 [pipeline.Pipeline.0]: Finished job job.BIRLinker.0 +2025-08-07T13:54:37Z INFO 47910 [pipeline.Pipeline.0]: Starting job job.Kelper.0 +2025-08-07T13:54:37Z INFO 47910 [job.Kelper.0]: Skipping neff generation which was already performed by neff_packager +2025-08-07T13:54:37Z INFO 47910 [pipeline.Pipeline.0]: Finished job job.Kelper.0 +2025-08-07T13:54:37Z INFO 47910 [pipeline.Pipeline.0]: Starting job job.NeffWrapper.0 +2025-08-07T13:54:37Z INFO 47910 [job.NeffWrapper.0]: Job NeffWrapper len(in_states) 1 +2025-08-07T13:54:37Z INFO 47910 [job.NeffWrapper.0]: Processing input #0 +2025-08-07T13:54:37Z INFO 47910 [job.NeffWrapper.0]: Start NeffWrapper +2025-08-07T13:54:37Z INFO 47910 [job.NeffWrapper.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo-neff-wrapper --hlo /home/ubuntu/qwen3/context_encoding_model/_tp0_bk2/model.MODULE_00594b8bc68e927f3dbe+1ad60ced.hlo_module.pb --neff /home/ubuntu/qwen3/context_encoding_model/_tp0_bk2/model.MODULE_00594b8bc68e927f3dbe+1ad60ced.neff --io_transposes /home/ubuntu/qwen3/context_encoding_model/_tp0_bk2/neuronxcc-_bze9vv6/io_transposes.json --output /home/ubuntu/qwen3/context_encoding_model/_tp0_bk2/wrapped_neff.hlo --netlist /home/ubuntu/qwen3/context_encoding_model/_tp0_bk2/neuronxcc-_bze9vv6/hlo_netlist.json +2025-08-07T13:54:38Z INFO 47910 [job.NeffWrapper.0]: There are no io transposes nor zero-sized parameters. Output will not be produced. +Hlo neff wrapper finished successfully. Have a wonderful day :D + +2025-08-07T13:54:38Z INFO 47910 [job.NeffWrapper.0]: Job #0 finished +2025-08-07T13:54:38Z INFO 47910 [pipeline.Pipeline.0]: Finished job job.NeffWrapper.0 +2025-08-07T13:54:38Z INFO 47910 [pipeline.Pipeline.0]: Finished pipeline Pipeline +2025-08-07T13:54:38Z INFO 47910 [pipeline.Pipeline.0]: Job #0 finished +2025-08-07T13:54:38Z INFO 47776 [root]: Subcommand returned with exitcode=0 diff --git a/context_encoding_model/_tp0_bk2/metaneff.pb b/context_encoding_model/_tp0_bk2/metaneff.pb new file mode 100644 index 0000000000000000000000000000000000000000..a8ccdca44af5b91d53d15fa06ae28dc0a551a5ee --- /dev/null +++ b/context_encoding_model/_tp0_bk2/metaneff.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99c279a1a32451ce56757879c7a74b6ff23378ae19871f2aee2c2746ceda57f3 +size 1373735 diff --git a/context_encoding_model/_tp0_bk2/model.MODULE_00594b8bc68e927f3dbe+1ad60ced.hlo_module.pb b/context_encoding_model/_tp0_bk2/model.MODULE_00594b8bc68e927f3dbe+1ad60ced.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..a028c9ad17a430d1049c3fb2d3be332b6ca84001 --- /dev/null +++ b/context_encoding_model/_tp0_bk2/model.MODULE_00594b8bc68e927f3dbe+1ad60ced.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:970c5138d61d773fc00bacb9090fbc05a05573925b8d91068006c211596d3f78 +size 1450821 diff --git a/context_encoding_model/_tp0_bk2/model.MODULE_00594b8bc68e927f3dbe+1ad60ced.neff b/context_encoding_model/_tp0_bk2/model.MODULE_00594b8bc68e927f3dbe+1ad60ced.neff new file mode 100644 index 0000000000000000000000000000000000000000..5c4104700a702e1ce4f095214871287974a27fc4 --- /dev/null +++ b/context_encoding_model/_tp0_bk2/model.MODULE_00594b8bc68e927f3dbe+1ad60ced.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7e216fd8f0f2acfef59524e7cdb4ead506b2c17c584ce45dd222cd4dc4e3f4f +size 1987584 diff --git a/context_encoding_model/_tp0_bk2/neuron_config.json b/context_encoding_model/_tp0_bk2/neuron_config.json new file mode 100644 index 0000000000000000000000000000000000000000..493c222f7a058f2a58eb8a368f6a337b8d719de4 --- /dev/null +++ b/context_encoding_model/_tp0_bk2/neuron_config.json @@ -0,0 +1,220 @@ +{ + "_attn_implementation_autoset": false, + "_name_or_path": "Qwen/Qwen3-8B", + "add_cross_attention": false, + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "attribute_map": {}, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 151643, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 151645, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "fused_spec_config": null, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 12288, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 40960, + "max_window_layers": 36, + "metadata": null, + "min_length": 0, + "model_type": "qwen3", + "neuron_config": { + "activation_quantization_type": null, + "allow_input_truncation": false, + "apply_seq_ids_mask": false, + "async_mode": false, + "attention_dp_degree": 1, + "attention_dtype": null, + "attn_block_cte_nki_kernel_enabled": false, + "attn_block_tkg_nki_kernel_cache_update": false, + "attn_block_tkg_nki_kernel_enabled": false, + "attn_cls": { + "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3", + "__name__": "NeuronQwen3Attention" + }, + "attn_kernel_enabled": null, + "attn_tkg_builtin_kernel_enabled": false, + "attn_tkg_nki_kernel_enabled": false, + "batch_size": 1, + "bucket_n_active_tokens": true, + "buckets": [ + 512 + ], + "cast_type": "config", + "cc_pipeline_tiling_factor": 2, + "chunked_prefill_config": null, + "context_encoding_buckets": [ + 512 + ], + "cp_degree": 1, + "ctx_batch_size": 1, + "disable_kv_cache_tiling": false, + "draft_model_modules_to_not_convert": null, + "enable_bucketing": true, + "enable_eagle_draft_input_norm": false, + "enable_eagle_speculation": false, + "enable_fused_speculation": false, + "enable_long_context_mode": false, + "enable_output_completion_notifications": false, + "enable_spill_reload_dge": false, + "enable_token_tree": false, + "ep_degree": 1, + "expert_mlp_nki_kernel_enabled": null, + "flash_decoding_enabled": false, + "fused_qkv": false, + "fused_rmsnorm_skip_gamma": false, + "is_block_kv_layout": null, + "is_chunked_prefill": false, + "is_continuous_batching": true, + "is_eagle_draft": false, + "is_medusa": false, + "is_prefill_stage": true, + "is_prefix_caching": false, + "k_cache_transposed": false, + "kv_cache_batch_size": 1, + "kv_cache_padding_size": 0, + "kv_cache_quant": false, + "kv_cache_tiling": false, + "layer_boundary_markers": false, + "lm_head_pad": false, + "lm_head_pad_alignment_size": 1, + "local_ranks_size": 2, + "logical_nc_config": 1, + "lora_config": null, + "max_batch_size": 1, + "max_context_length": 1024, + "max_length": 1024, + "max_new_tokens": null, + "medusa_speculation_length": 0, + "medusa_tree": null, + "mlp_kernel_enabled": false, + "mlp_kernel_fuse_residual_add": false, + "modules_to_not_convert": null, + "moe_fused_nki_kernel_enabled": null, + "n_active_tokens": 1024, + "n_positions": 1024, + "num_medusa_heads": 0, + "on_cpu": false, + "on_device_sampling_config": { + "deterministic": false, + "do_sample": false, + "dynamic": true, + "global_topk": 256, + "on_device_sampling_config": true, + "temperature": 1.0, + "top_k": 1, + "top_k_kernel_enabled": false, + "top_p": 1.0 + }, + "output_logits": false, + "overrides_torch_dtype": true, + "pa_block_size": 1024, + "pa_num_blocks": 1, + "padding_side": "right", + "pp_degree": 1, + "prefix_buckets": null, + "qk_layernorm": false, + "qkv_kernel_enabled": false, + "qkv_kernel_fuse_residual_add": false, + "qkv_kernel_nbsd_layout": false, + "quantization_dtype": "int8", + "quantization_type": "per_tensor_symmetric", + "quantize_clamp_bound": Infinity, + "quantized": false, + "quantized_checkpoints_path": null, + "quantized_mlp_kernel_enabled": false, + "rmsnorm_quantize_kernel_enabled": false, + "router_topk_nki_kernel_enabled": null, + "rpl_reduce_dtype": null, + "save_sharded_checkpoint": true, + "scratchpad_page_size": null, + "seq_len": 1024, + "seq_len_threshold_for_cc_tiling": 16384, + "sequence_parallel_enabled": false, + "shared_mlp_nki_kernel_enabled": null, + "skip_sharding": false, + "skip_warmup": false, + "spec_batch_size": 1, + "speculation_length": 0, + "start_rank_id": 0, + "target": null, + "tile_cc": false, + "tkg_batch_size": 1, + "token_generation_buckets": null, + "token_tree_config": null, + "torch_dtype": "bfloat16", + "tp_degree": 2, + "vocab_parallel": false, + "weight_gather_seq_len_threshold": 32768, + "weights_to_skip_layout_optimization": [], + "world_size": 2 + }, + "no_repeat_ngram_size": 0, + "num_attention_heads": 32, + "num_beam_groups": 1, + "num_beams": 1, + "num_cores_per_group": 1, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sep_token_id": null, + "sliding_window": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torchscript": false, + "transformers_version": "4.51.0", + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/context_encoding_model/_tp0_bk3/command.txt b/context_encoding_model/_tp0_bk3/command.txt new file mode 100644 index 0000000000000000000000000000000000000000..f268b998e32fdf1d5530fe39610887da1f3677aa --- /dev/null +++ b/context_encoding_model/_tp0_bk3/command.txt @@ -0,0 +1 @@ +neuronx-cc compile --framework=XLA model.MODULE_b3ddbc97e5f0d1d64c82+155de413.hlo_module.pb --output model.MODULE_b3ddbc97e5f0d1d64c82+155de413.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=1 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35 \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk3/compile_flags.MODULE_b3ddbc97e5f0d1d64c82+155de413.json b/context_encoding_model/_tp0_bk3/compile_flags.MODULE_b3ddbc97e5f0d1d64c82+155de413.json new file mode 100644 index 0000000000000000000000000000000000000000..3aa3759093f66093432295637d179954fcbfff30 --- /dev/null +++ b/context_encoding_model/_tp0_bk3/compile_flags.MODULE_b3ddbc97e5f0d1d64c82+155de413.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=1", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/home/ubuntu/qwen3/context_encoding_model/_tp0_bk3/log-neuron-cc.txt"] \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk3/global_metric_store.json b/context_encoding_model/_tp0_bk3/global_metric_store.json new file mode 100644 index 0000000000000000000000000000000000000000..f9a76349adb0c37f9e8ac979610c2f74a719c473 --- /dev/null +++ b/context_encoding_model/_tp0_bk3/global_metric_store.json @@ -0,0 +1,1079 @@ +{ + "Average": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 99.7004623413086, + "StaticProfiler::AveragePartitionUtilization": 97.94140625, + "StaticProfiler::AveragePeUtilization": 98.78884887695313, + "StaticProfiler::LocalizationEfficiency": 91.59693145751953, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 95.863037109375, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0 + } + }, + "Count": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 1.0, + "StaticProfiler::AveragePartitionUtilization": 1.0, + "StaticProfiler::AveragePeUtilization": 1.0, + "StaticProfiler::LocalizationEfficiency": 1.0, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 1.0 + } + }, + "Sum": { + "compiletime": { + "AGOrderingAnalysisPass": 0.01837611198425293, + "AffinePredicateResolution": 0.0011184215545654297, + "AliasDependencyElimination": 0.00015664100646972656, + "AliasDependencyInduction": 0.005170583724975586, + "AliasDependencyReset": 0.027508020401000977, + "BFComputeCutting": 0.0036101341247558594, + "BirCodeGenLoop": 0.4774467945098877, + "CCOpFusion": 0.033265113830566406, + "CanonicalizeConv": 2.300000051036477e-05, + "CanonicalizeDAGForPGTiling": 0.004282712936401367, + "CanonicalizeForTensorizer": 4.600000102072954e-05, + "CanonicalizeIR": 0.0024569034576416016, + "Canonicalizer": 0.0009039999567903578, + "CoalesceCCOp": 0.014229059219360352, + "CommuteConcat": 0.0017316341400146484, + "DMALocalityOpt": 0.005630016326904297, + "DMAProfiler": 0.012981653213500977, + "DMATilingProfiler": 0.0037560462951660156, + "DataLocalityOpt": 0.07645320892333984, + "DataStreaming": 0.03730320930480957, + "DeConcat": 0.0018520355224609375, + "DeadCodeElimination": 0.0020148754119873047, + "DeadStoreElimination": 0.006912708282470703, + "DelinearIndices": 0.004647254943847656, + "Delinearization": 0.003908872604370117, + "DoNothing": 0.0001888275146484375, + "DramToDramTranspose": 0.02015542984008789, + "DumpGraphAndMetadata": 0.08691883087158203, + "EliminateDivs": 0.0025060176849365234, + "ExpandBatchNorm": 0.0027189254760742188, + "ExpandISAMacro": 0.011646032333374023, + "FactorizeBlkDims": 0.010123252868652344, + "FactorizeThreadAxesInFreeDims": 0.0023202896118164063, + "FlattenMacroLoop": 0.00232696533203125, + "GenericAccessSimplifier": 0.0008094310760498047, + "HoistCompute": 5.999999757477781e-06, + "IdentifyCrossPassTensors": 5.2999999752501026e-05, + "InferInitValue": 0.02833867073059082, + "InferIntrinsicOnCC": 0.008923768997192383, + "InferNeuronTensor": 0.025766372680664063, + "InferNonlocalTensors": 0.014599800109863281, + "InferPSumTensor": 0.28418898582458496, + "InlineNativeKernels": 0.00860905647277832, + "InsertIOTransposes": 0.01989889144897461, + "InsertLocalTransposes": 0.004229307174682617, + "InsertOffloadedTransposes": 0.0029871463775634766, + "LICM": 0.0030870437622070313, + "LateLegalizeInst": 0.014106035232543945, + "LateLegalizePostSplit": 0.014872312545776367, + "LateLowerReshapeOp": 0.0010464191436767578, + "LateLowerTensorOp": 0.002707242965698242, + "LateNeuronInstComb": 0.010563373565673828, + "LayoutPreprocessing": 0.026853561401367188, + "LayoutPreprocessingAndAnalysis": 0.0556035041809082, + "LayoutRequirementAnalysis": 0.004946470260620117, + "LegalizeCCOpLayout": 0.0025353431701660156, + "LegalizeOpLevelAlias": 0.0018966197967529297, + "LegalizePartitionReduce": 0.0017490386962890625, + "LegalizeSundaAccess": 0.07800722122192383, + "LegalizeSundaMacro": 0.012125253677368164, + "LegalizeType": 0.012685060501098633, + "LocalLayoutOpt": 0.013860225677490234, + "LoopFusion": 0.005201578140258789, + "LoopSplitting": 0.0003204345703125, + "LowerBroadcast": 0.002086162567138672, + "LowerCCOpBlockAxis": 0.0040171146392822266, + "LowerComplexBroadcast": 0.002280712127685547, + "LowerIntrinsics": 0.3143951892852783, + "LowerTensorOp": 0.01141357421875, + "LowerTranspose": 0.012923002243041992, + "MacroGeneration": 0.034410953521728516, + "MaskPropagation": 0.0028192996978759766, + "MemcastMotion": 1.8000000636675395e-05, + "MemcpyElimination": 0.02788853645324707, + "MutateDataType": 0.0012311935424804688, + "NeuronAliasDependencyInduction": 0.0001773834228515625, + "NeuronAliasDependencyReset": 0.024976015090942383, + "NeuronInstComb": 0.005156517028808594, + "NeuronLICM": 0.036696434020996094, + "NeuronLoopFusion": 0.008457422256469727, + "NeuronLoopInterchange": 0.001413106918334961, + "NeuronSimplifier": 0.007856369018554688, + "NeuronSimplifyPredicates": 0.12235808372497559, + "NeuronValueNumbering": 0.004765748977661133, + "OptimizeAliasedCopyChain": 0.0006341934204101563, + "OptimizeNKIKernels": 0.38834357261657715, + "PAGLayoutOpt": 0.0889735221862793, + "PComputeCutting": 0.005109071731567383, + "PGLayoutTilingPipeline": 0.6248171329498291, + "PGTiling": 0.1645822525024414, + "PadElimination": 0.0003485679626464844, + "ParAxesAnnotation": 0.05196070671081543, + "PartialLoopFusion": 0.011112451553344727, + "PartialSimdFusion": 0.012138128280639648, + "PenguinizeFunctions": 4.3000000005122274e-05, + "PerfectLoopNest": 0.002288341522216797, + "PruneFunctions": 4.099999932805076e-05, + "RecognizeOpIdiom": 0.0041277408599853516, + "Recompute": 0.00026416778564453125, + "RelaxPredicates": 0.01356959342956543, + "Rematerialization": 0.0024864673614501953, + "RemoveOptimizationBarriers": 4.900000203633681e-05, + "ReshapeWeights": 0.0007522106170654297, + "ResolveAccessConflict": 0.0048482418060302734, + "ResolveComplicatePredicates": 0.0015094280242919922, + "RewriteReplicationMatmul": 0.0015668869018554688, + "RewriteWeights": 0.0027174949645996094, + "SFKVectorizer": 0.2781519889831543, + "ScatterMotion": 4.70000013592653e-05, + "SimpleAllReduceTiling": 0.009549379348754883, + "Simplifier": 0.003630399703979492, + "SimplifyMacroPredicates": 0.011396646499633789, + "SimplifyNeuronTensor": 1.0561063289642334, + "SimplifySlice": 0.0023348331451416016, + "SimplifyTensor": 0.005601167678833008, + "SpillPSum": 0.013618230819702148, + "SplitAPUnionSets": 0.11336159706115723, + "SplitAccGrp": 0.001394510269165039, + "StaticProfiler": 0.014252662658691406, + "StaticTransposeLocalTensor": 0.003930330276489258, + "SundaISel": 0.04436635971069336, + "TCTransform": 0.0008757114410400391, + "TensorInitialization": 0.01558232307434082, + "TensorOpSimplifier": 0.004608869552612305, + "TensorOpTransform": 0.01923346519470215, + "TensorizerLegalizationPass": 5.2999999752501026e-05, + "TileCCOps": 0.005507707595825195, + "TilingProfiler": 0.007405757904052734, + "TransformConvOp": 0.0030219554901123047, + "TritiumFusion": 0.05425119400024414, + "ValueNumbering": 0.0020017623901367188, + "VectorizeDMA": 0.002228975296020508, + "VectorizeMatMult": 0.006806135177612305, + "VerifySupportedOps": 3.5000000934815034e-05, + "WeightCoalescing": 0.008660554885864258, + "ZeroSizeTensorElimination": 0.00014281272888183594, + "algsimp": 0.0027209999971091747, + "batchnorm_expander": 4.099999932805076e-05, + "boundary-marker-removal": 1.2999998943996616e-05, + "call-inliner": 0.0004540000227279961, + "canonicalize-boundary-marker": 1.700000029813964e-05, + "collective-stream-id-checker": 8.000000525498763e-05, + "comparison-expander": 0.0005869999877177179, + "computation-deduplicator": 7.500000356230885e-05, + "conditional-to-select": 1.700000029813964e-05, + "config-lowering": 8.800000068731606e-05, + "constant-statistics": 0.0005440000095404685, + "constant_folding": 0.00032700004521757364, + "cse": 3.7000001611886546e-05, + "dce": 9.100000170292333e-05, + "dot_decomposer": 0.0013370000524446368, + "dynamic-slice-transpose": 1.2000000424450263e-05, + "eliminate-redundant-compare": 0.0003020000003743917, + "emit-offloaded-dropout": 3.9999998989515007e-05, + "flatten-call-graph": 0.0009239999344572425, + "fuse-send-recv": 7.79999973019585e-05, + "hilo::LegalizeAlias": 1.1999999514955562e-05, + "hilo::NeuronInstCombine": 0.00018899999849963933, + "hilo::NeuronOpFusion": 4.5000000682193786e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 5.7999997807201e-05, + "hilo::ScheduleFusion": 0.00016099998902063817, + "hilo::SixtyFourHack": 6.70000008540228e-05, + "hilo::VerifyAliasing": 4.999999873689376e-06, + "hlo-mac-count": 0.0013409999664872885, + "hlo-verifier": 0.007716999854892492, + "instruction-histogram": 0.0007719999994151294, + "io-con-pipe-begin": 4.999999873689376e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.00139999995008111, + "io-statistics": 6.299999949987978e-05, + "legalize-ccops": 3.999999989900971e-06, + "legalize-compare": 1.1000000085914508e-05, + "lower-argminmax-custom-call": 1.1000000085914508e-05, + "map-inline": 0.0008809999562799931, + "metadata-naming": 6.70000008540228e-05, + "mlir::detail::OpToOpPassAdaptor": 0.00020599999697878957, + "mlir::hlo::MhloToPyPenguin": 0.00291300006210804, + "mlir::mhlo::LowerComplexExtraPass": 0.00027200000477023423, + "mlir::mhlo::LowerComplexPass": 0.0003980000037699938, + "native-to-custom-softmax": 0.0007730000070296228, + "native-to-custom-softmax-dx": 0.0006189999985508621, + "operand_upcaster": 6.299999949987978e-05, + "opt-barrier-removal": 0.0005789999850094318, + "post-par-pipe-begin": 7.999999979801942e-06, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0017419999931007624, + "pre-par-pipe-begin": 1.9999999949504854e-06, + "pre-par-pipe-end": 0.0, + "pre-partition-simplification": 0.1384889930486679, + "replace-minimum-constant": 0.0004579999949783087, + "reshape-mover": 0.00011000000085914508, + "simplify-concat": 0.00014099999680183828, + "simplify-while-loops": 9.40000027185306e-05, + "transform-variadic-reduce": 8.100000559352338e-05, + "tuple-simplifier": 0.00030600003083236516, + "unpack-nested-aws-ntwsr": 0.000438000017311424, + "unroll-while-loop": 1.8999999156221747e-05, + "zero_sized_hlo_elimination": 0.0008750000270083547 + }, + "hilo": { + "ConstantSize": 2368805.0, + "HloInputCount": 475.0, + "HloMacCount": 206469595136.0, + "HloOutputCount": 73.0, + "IfmapSize": 8266549248.0, + "OfmapSize": 75497472.0, + "OutputsReadFromCount": 0.0, + "PassthroughTensorsCount": 0.0, + "RedundantOutputCount": 0.0, + "Traffic": 1751252352.0 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 49538.0, + "StaticProfiler::AifUb": 304.240234375, + "StaticProfiler::ArithmeticIntensityTensorizer": 278.67474365234375, + "StaticProfiler::AverageDmaLength": 1974.1033935546875, + "StaticProfiler::DDRTransferBytes": 862646080.0, + "StaticProfiler::InternalTransferBytes": 669456896.0, + "StaticProfiler::LoadExpanded": 390679.0, + "StaticProfiler::StoreExpanded": 7261.0, + "StaticProfiler::TotalDMAExpanded": 397940.0, + "StaticProfiler::TotalDynamicInstancesCount": 59578.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 59132.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 4.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 28224.0, + "TilingProfiler::NumPfTransposes": 5.0, + "TilingProfiler::NumPfTransposesForIo": 1.0, + "TilingProfiler::NumPfTransposesForLocal": 1.0, + "TilingProfiler::NumPfTransposesForNonlocal": 3.0, + "TilingProfiler::PfTransposeInstructions": 19777.0, + "TilingProfiler::PfTransposeInstructionsForIo": 19008.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 1.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 768.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 6.0, + "TilingProfiler::SimdInstructionsAfterTiling": 303.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "all": { + "compiletime": { + "algsimp": 0.002532999962568283, + "call-inliner": 0.00042600001324899495, + "collective-stream-id-checker": 6.70000008540228e-05, + "comparison-expander": 0.0005719999899156392, + "constant-statistics": 0.0005440000095404685, + "constant_folding": 0.0003000000142492354, + "dce": 8.800000068731606e-05, + "dot_decomposer": 0.0013370000524446368, + "eliminate-redundant-compare": 0.000291000003926456, + "flatten-call-graph": 0.0008929999894462526, + "hlo-mac-count": 0.0010870000114664435, + "hlo-verifier": 0.007048000115901232, + "instruction-histogram": 0.0007719999994151294, + "io-con-pipe-begin": 4.999999873689376e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.00139999995008111, + "io-statistics": 6.299999949987978e-05, + "map-inline": 0.0008459999808110297, + "native-to-custom-softmax": 0.0006709999870508909, + "native-to-custom-softmax-dx": 0.0005300000193528831, + "opt-barrier-removal": 0.0005789999850094318, + "pre-par-pipe-begin": 1.9999999949504854e-06, + "pre-par-pipe-end": 0.0, + "pre-partition-simplification": 0.1384889930486679, + "replace-minimum-constant": 0.00041700000292621553, + "reshape-mover": 9.999999747378752e-05, + "simplify-while-loops": 8.800000068731606e-05, + "tuple-simplifier": 0.000291000003926456, + "unpack-nested-aws-ntwsr": 0.00042600001324899495, + "unroll-while-loop": 1.8999999156221747e-05, + "zero_sized_hlo_elimination": 0.0008750000270083547 + } + }, + "cumsum": { + "compiletime": { + "CoalesceCCOp": 0.00023508071899414063, + "DMALocalityOpt": 0.00017404556274414063, + "DMAProfiler": 0.0008785724639892578, + "DataStreaming": 0.0002880096435546875, + "DoNothing": 0.00011467933654785156, + "ExpandISAMacro": 0.0006787776947021484, + "FactorizeBlkDims": 0.0004444122314453125, + "InferPSumTensor": 0.0004467964172363281, + "LateLegalizeInst": 0.000461578369140625, + "LateNeuronInstComb": 0.0004818439483642578, + "LegalizeSundaAccess": 0.0016222000122070313, + "LegalizeType": 0.0002703666687011719, + "LowerBroadcast": 0.00025391578674316406, + "LowerIntrinsics": 0.00021457672119140625, + "LowerTranspose": 0.00024318695068359375, + "NeuronInstComb": 0.00048065185546875, + "NeuronLICM": 0.00038552284240722656, + "NeuronSimplifyPredicates": 0.0027823448181152344, + "NeuronValueNumbering": 0.00043129920959472656, + "SFKVectorizer": 0.003134012222290039, + "SimpleAllReduceTiling": 0.00022721290588378906, + "SimplifyNeuronTensor": 0.0005092620849609375, + "SpillPSum": 0.0005443096160888672, + "WeightCoalescing": 0.00020051002502441406 + } + }, + "sg00": { + "compiletime": { + "CanonicalizeConv": 9.999999974752427e-07, + "CanonicalizeForTensorizer": 1.700000029813964e-05, + "Canonicalizer": 0.00033599999733269215, + "HoistCompute": 3.000000106112566e-06, + "IdentifyCrossPassTensors": 1.5999999959603883e-05, + "MemcastMotion": 1.1000000085914508e-05, + "PenguinizeFunctions": 1.8000000636675395e-05, + "PruneFunctions": 1.4000000192027073e-05, + "RemoveOptimizationBarriers": 1.2999999853491317e-05, + "ScatterMotion": 2.4000000848900527e-05, + "TensorizerLegalizationPass": 2.700000004551839e-05, + "VerifySupportedOps": 1.2000000424450263e-05, + "algsimp": 6.500000017695129e-05, + "batchnorm_expander": 1.4000000192027073e-05, + "boundary-marker-removal": 3.999999989900971e-06, + "call-inliner": 9.000000318337698e-06, + "canonicalize-boundary-marker": 6.000000212225132e-06, + "collective-stream-id-checker": 3.999999989900971e-06, + "comparison-expander": 4.999999873689376e-06, + "computation-deduplicator": 2.300000051036477e-05, + "conditional-to-select": 4.999999873689376e-06, + "config-lowering": 3.9999998989515007e-05, + "constant_folding": 9.000000318337698e-06, + "cse": 1.2999999853491317e-05, + "dce": 9.999999974752427e-07, + "dynamic-slice-transpose": 3.999999989900971e-06, + "eliminate-redundant-compare": 3.999999989900971e-06, + "emit-offloaded-dropout": 1.4000000192027073e-05, + "flatten-call-graph": 9.999999747378752e-06, + "fuse-send-recv": 2.8000000384054147e-05, + "hilo::LegalizeAlias": 4.999999873689376e-06, + "hilo::NeuronInstCombine": 8.499999967170879e-05, + "hilo::NeuronOpFusion": 2.700000004551839e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 1.4999999621068127e-05, + "hilo::ScheduleFusion": 9.999999974752427e-07, + "hilo::SixtyFourHack": 1.2999999853491317e-05, + "hilo::VerifyAliasing": 1.9999999949504854e-06, + "hlo-mac-count": 3.099999958067201e-05, + "hlo-verifier": 0.0002530000056140125, + "legalize-ccops": 9.999999974752427e-07, + "legalize-compare": 3.999999989900971e-06, + "lower-argminmax-custom-call": 3.999999989900971e-06, + "map-inline": 1.2000000424450263e-05, + "metadata-naming": 2.4000000848900527e-05, + "mlir::detail::OpToOpPassAdaptor": 2.2000000171829015e-05, + "mlir::hlo::MhloToPyPenguin": 0.0010389999952167273, + "mlir::mhlo::LowerComplexExtraPass": 8.800000068731606e-05, + "mlir::mhlo::LowerComplexPass": 0.00014200000441633165, + "native-to-custom-softmax": 9.000000136438757e-05, + "native-to-custom-softmax-dx": 4.3000000005122274e-05, + "operand_upcaster": 2.300000051036477e-05, + "post-par-pipe-begin": 3.000000106112566e-06, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0006249999860301614, + "replace-minimum-constant": 2.5999999706982635e-05, + "reshape-mover": 3.999999989900971e-06, + "simplify-concat": 4.8000001697801054e-05, + "simplify-while-loops": 1.9999999949504854e-06, + "transform-variadic-reduce": 9.000000318337698e-06, + "tuple-simplifier": 4.999999873689376e-06, + "unpack-nested-aws-ntwsr": 3.999999989900971e-06, + "unroll-while-loop": 0.0 + }, + "hilo": { + "ArithmeticIntensity": 73.02900695800781, + "ConstantSize": 2368805.0, + "HloInputCount": 475.0, + "HloMacCount": 25769803776.0, + "HloOutputCount": 73.0, + "IfmapSize": 8266549248.0, + "OfmapSize": 75497472.0, + "OutputsReadFromCount": 0.0, + "PassthroughTensorsCount": 0.0, + "RedundantOutputCount": 0.0, + "Traffic": 705741632.0 + } + }, + "sg0000": { + "compiletime": { + "AGOrderingAnalysisPass": 0.0818486213684082, + "AffinePredicateResolution": 0.001665353775024414, + "AliasDependencyElimination": 0.00012683868408203125, + "AliasDependencyInduction": 0.008559942245483398, + "AliasDependencyReset": 0.03254294395446777, + "BFComputeCutting": 0.003969907760620117, + "BirCodeGenLoop": 0.06339025497436523, + "CCOpFusion": 0.029911041259765625, + "CanonicalizeDAGForPGTiling": 0.003092050552368164, + "CanonicalizeIR": 0.002637147903442383, + "CoalesceCCOp": 0.0051479339599609375, + "CommuteConcat": 0.001478433609008789, + "DMALocalityOpt": 0.0016412734985351563, + "DMAProfiler": 0.004613637924194336, + "DMATilingProfiler": 0.004850864410400391, + "DataLocalityOpt": 0.11357831954956055, + "DataStreaming": 0.0061092376708984375, + "DeConcat": 0.0013332366943359375, + "DeadCodeElimination": 0.0018727779388427734, + "DeadStoreElimination": 0.03094482421875, + "DelinearIndices": 0.008640289306640625, + "Delinearization": 0.0035429000854492188, + "DoNothing": 8.106231689453125e-05, + "DramToDramTranspose": 0.03549051284790039, + "DumpGraphAndMetadata": 0.005577564239501953, + "EliminateDivs": 0.003966331481933594, + "ExpandBatchNorm": 0.0017447471618652344, + "ExpandISAMacro": 0.002687692642211914, + "FactorizeBlkDims": 0.026469945907592773, + "FactorizeThreadAxesInFreeDims": 0.0014863014221191406, + "FlattenMacroLoop": 0.00392913818359375, + "GenericAccessSimplifier": 0.0018973350524902344, + "InferInitValue": 0.03517007827758789, + "InferIntrinsicOnCC": 0.010237932205200195, + "InferNeuronTensor": 0.051462411880493164, + "InferNonlocalTensors": 0.14991235733032227, + "InferPSumTensor": 0.053685903549194336, + "InlineNativeKernels": 0.002433300018310547, + "InsertIOTransposes": 0.015550613403320313, + "InsertLocalTransposes": 0.007843017578125, + "InsertOffloadedTransposes": 0.002854585647583008, + "LICM": 0.003381490707397461, + "LateLegalizeInst": 0.0069310665130615234, + "LateLegalizePostSplit": 0.00308990478515625, + "LateLowerReshapeOp": 0.0017940998077392578, + "LateLowerTensorOp": 0.005001068115234375, + "LateNeuronInstComb": 0.016704320907592773, + "LayoutPreprocessing": 0.033296823501586914, + "LayoutPreprocessingAndAnalysis": 0.12302517890930176, + "LayoutRequirementAnalysis": 0.007364988327026367, + "LegalizeCCOpLayout": 0.0029296875, + "LegalizeOpLevelAlias": 0.0016987323760986328, + "LegalizePartitionReduce": 0.0014727115631103516, + "LegalizeSundaAccess": 0.04025077819824219, + "LegalizeSundaMacro": 0.009906291961669922, + "LegalizeType": 0.004493236541748047, + "LocalLayoutOpt": 0.017308473587036133, + "LoopFusion": 0.005831241607666016, + "LoopSplitting": 0.00037789344787597656, + "LowerBroadcast": 0.0016851425170898438, + "LowerCCOpBlockAxis": 0.005655765533447266, + "LowerComplexBroadcast": 0.0020987987518310547, + "LowerIntrinsics": 0.040236473083496094, + "LowerTensorOp": 0.012641191482543945, + "LowerTranspose": 0.0125579833984375, + "MacroGeneration": 0.08074021339416504, + "MaskPropagation": 0.005038022994995117, + "MemcpyElimination": 0.10875082015991211, + "MutateDataType": 0.0013315677642822266, + "NeuronAliasDependencyInduction": 0.00025200843811035156, + "NeuronAliasDependencyReset": 0.021958112716674805, + "NeuronInstComb": 0.009703636169433594, + "NeuronLICM": 0.011526823043823242, + "NeuronLoopFusion": 0.017663955688476563, + "NeuronLoopInterchange": 0.002567291259765625, + "NeuronSimplifier": 0.011670589447021484, + "NeuronSimplifyPredicates": 0.017385244369506836, + "NeuronValueNumbering": 0.004181623458862305, + "OptimizeAliasedCopyChain": 0.0017867088317871094, + "OptimizeNKIKernels": 0.0020456314086914063, + "PAGLayoutOpt": 0.3681519031524658, + "PComputeCutting": 0.008620262145996094, + "PGLayoutTilingPipeline": 1.3210320472717285, + "PGTiling": 0.27039527893066406, + "PadElimination": 0.0003745555877685547, + "ParAxesAnnotation": 0.33005595207214355, + "PartialLoopFusion": 0.026912212371826172, + "PartialSimdFusion": 0.03544425964355469, + "PerfectLoopNest": 0.0021703243255615234, + "RecognizeOpIdiom": 0.004334926605224609, + "Recompute": 0.0002522468566894531, + "RelaxPredicates": 0.004270076751708984, + "Rematerialization": 0.005487918853759766, + "ReshapeWeights": 0.0006825923919677734, + "ResolveAccessConflict": 0.003779888153076172, + "ResolveComplicatePredicates": 0.0018131732940673828, + "RewriteReplicationMatmul": 0.002633333206176758, + "RewriteWeights": 0.0036499500274658203, + "SFKVectorizer": 0.2772994041442871, + "SimpleAllReduceTiling": 0.002454519271850586, + "Simplifier": 0.0045070648193359375, + "SimplifyMacroPredicates": 0.016190290451049805, + "SimplifyNeuronTensor": 0.01452183723449707, + "SimplifySlice": 0.0010039806365966797, + "SimplifyTensor": 0.00657200813293457, + "SpillPSum": 0.02208685874938965, + "SplitAPUnionSets": 0.04095458984375, + "SplitAccGrp": 0.0018160343170166016, + "StaticProfiler": 0.004816770553588867, + "StaticTransposeLocalTensor": 0.004886150360107422, + "SundaISel": 0.04611611366271973, + "TCTransform": 0.001667022705078125, + "TensorInitialization": 0.022374629974365234, + "TensorOpSimplifier": 0.006697177886962891, + "TensorOpTransform": 0.02793574333190918, + "TileCCOps": 0.007641792297363281, + "TilingProfiler": 0.015750885009765625, + "TransformConvOp": 0.0026845932006835938, + "TritiumFusion": 0.08186149597167969, + "ValueNumbering": 0.0026755332946777344, + "VectorizeDMA": 0.007223367691040039, + "VectorizeMatMult": 0.018305540084838867, + "WeightCoalescing": 0.003328561782836914, + "ZeroSizeTensorElimination": 0.00011229515075683594 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 5862.0, + "StaticProfiler::AifUb": 88.59026336669922, + "StaticProfiler::ArithmeticIntensityTensorizer": 582.7418823242188, + "StaticProfiler::AverageDmaLength": 2248.2685546875, + "StaticProfiler::AverageFractalPeUtilization": 99.96076202392578, + "StaticProfiler::AveragePartitionUtilization": 99.90216827392578, + "StaticProfiler::AveragePeUtilization": 99.8394546508789, + "StaticProfiler::DDRTransferBytes": 104424704.0, + "StaticProfiler::InternalTransferBytes": 122421248.0, + "StaticProfiler::LoadExpanded": 25346.0, + "StaticProfiler::LocalizationEfficiency": 657.7944946289063, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 915.0787353515625, + "StaticProfiler::StoreExpanded": 10753.0, + "StaticProfiler::TotalDMAExpanded": 36099.0, + "StaticProfiler::TotalDynamicInstancesCount": 8866.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 8860.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 96.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 3080.0, + "TilingProfiler::NumPfTransposes": 8.0, + "TilingProfiler::NumPfTransposesForIo": 1.0, + "TilingProfiler::NumPfTransposesForLocal": 6.0, + "TilingProfiler::NumPfTransposesForNonlocal": 1.0, + "TilingProfiler::PfTransposeInstructions": 1760.0, + "TilingProfiler::PfTransposeInstructionsForIo": 256.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 1376.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 128.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 0.0, + "TilingProfiler::SimdInstructionsAfterTiling": 649.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg0001": { + "compiletime": { + "AGOrderingAnalysisPass": 0.03383040428161621, + "AffinePredicateResolution": 0.0015320777893066406, + "AliasDependencyElimination": 0.0001316070556640625, + "AliasDependencyInduction": 0.00819253921508789, + "AliasDependencyReset": 0.02862405776977539, + "BFComputeCutting": 0.004217624664306641, + "BirCodeGenLoop": 0.0443270206451416, + "CCOpFusion": 0.04336118698120117, + "CanonicalizeDAGForPGTiling": 0.0031616687774658203, + "CanonicalizeIR": 0.0021500587463378906, + "CoalesceCCOp": 0.005389690399169922, + "CommuteConcat": 0.0024237632751464844, + "DMALocalityOpt": 0.002274751663208008, + "DMAProfiler": 0.003973484039306641, + "DMATilingProfiler": 0.005924701690673828, + "DataLocalityOpt": 0.15027260780334473, + "DataStreaming": 0.004762887954711914, + "DeConcat": 0.0018739700317382813, + "DeadCodeElimination": 0.001882314682006836, + "DeadStoreElimination": 0.03486776351928711, + "DelinearIndices": 0.009628534317016602, + "Delinearization": 0.0037381649017333984, + "DoNothing": 6.985664367675781e-05, + "DramToDramTranspose": 0.04212188720703125, + "DumpGraphAndMetadata": 0.004312038421630859, + "EliminateDivs": 0.005432844161987305, + "ExpandBatchNorm": 0.002119302749633789, + "ExpandISAMacro": 0.0024309158325195313, + "FactorizeBlkDims": 0.02235579490661621, + "FactorizeThreadAxesInFreeDims": 0.0018169879913330078, + "FlattenMacroLoop": 0.0030968189239501953, + "GenericAccessSimplifier": 0.0016777515411376953, + "InferInitValue": 0.043079376220703125, + "InferIntrinsicOnCC": 0.009890556335449219, + "InferNeuronTensor": 0.05600404739379883, + "InferNonlocalTensors": 0.03101515769958496, + "InferPSumTensor": 0.04645681381225586, + "InlineNativeKernels": 0.0015399456024169922, + "InsertIOTransposes": 0.02417731285095215, + "InsertLocalTransposes": 0.0070497989654541016, + "InsertOffloadedTransposes": 0.003525972366333008, + "LICM": 0.0035805702209472656, + "LateLegalizeInst": 0.0041539669036865234, + "LateLegalizePostSplit": 0.0027403831481933594, + "LateLowerReshapeOp": 0.0014560222625732422, + "LateLowerTensorOp": 0.004617452621459961, + "LateNeuronInstComb": 0.015344619750976563, + "LayoutPreprocessing": 0.030884981155395508, + "LayoutPreprocessingAndAnalysis": 0.06435275077819824, + "LayoutRequirementAnalysis": 0.007463693618774414, + "LegalizeCCOpLayout": 0.002064943313598633, + "LegalizeOpLevelAlias": 0.0011925697326660156, + "LegalizePartitionReduce": 0.0026116371154785156, + "LegalizeSundaAccess": 0.015822887420654297, + "LegalizeSundaMacro": 0.012560844421386719, + "LegalizeType": 0.004744291305541992, + "LocalLayoutOpt": 0.023772239685058594, + "LoopFusion": 0.0066835880279541016, + "LoopSplitting": 0.0003638267517089844, + "LowerBroadcast": 0.002238750457763672, + "LowerCCOpBlockAxis": 0.005678653717041016, + "LowerComplexBroadcast": 0.0019271373748779297, + "LowerIntrinsics": 0.042801856994628906, + "LowerTensorOp": 0.012106895446777344, + "LowerTranspose": 0.012960433959960938, + "MacroGeneration": 0.12800955772399902, + "MaskPropagation": 0.0031516551971435547, + "MemcpyElimination": 0.10379505157470703, + "MutateDataType": 0.0014393329620361328, + "NeuronAliasDependencyInduction": 0.00022101402282714844, + "NeuronAliasDependencyReset": 0.020102262496948242, + "NeuronInstComb": 0.009283781051635742, + "NeuronLICM": 0.009867429733276367, + "NeuronLoopFusion": 0.022713661193847656, + "NeuronLoopInterchange": 0.002709627151489258, + "NeuronSimplifier": 0.01328134536743164, + "NeuronSimplifyPredicates": 0.001683950424194336, + "NeuronValueNumbering": 0.0033235549926757813, + "OptimizeAliasedCopyChain": 0.0007724761962890625, + "OptimizeNKIKernels": 0.001729726791381836, + "PAGLayoutOpt": 0.13172507286071777, + "PComputeCutting": 0.007474422454833984, + "PGLayoutTilingPipeline": 0.9329550266265869, + "PGTiling": 0.4518747329711914, + "PadElimination": 0.00040411949157714844, + "ParAxesAnnotation": 0.0915369987487793, + "PartialLoopFusion": 0.020573854446411133, + "PartialSimdFusion": 0.04284977912902832, + "PerfectLoopNest": 0.002377033233642578, + "RecognizeOpIdiom": 0.0049991607666015625, + "Recompute": 0.00026345252990722656, + "RelaxPredicates": 0.0034220218658447266, + "Rematerialization": 0.0021615028381347656, + "ReshapeWeights": 0.0007557868957519531, + "ResolveAccessConflict": 0.004181861877441406, + "ResolveComplicatePredicates": 0.0015151500701904297, + "RewriteReplicationMatmul": 0.0020759105682373047, + "RewriteWeights": 0.0036649703979492188, + "SFKVectorizer": 0.20148277282714844, + "SimpleAllReduceTiling": 0.003732442855834961, + "Simplifier": 0.004697084426879883, + "SimplifyMacroPredicates": 0.007361888885498047, + "SimplifyNeuronTensor": 0.009825944900512695, + "SimplifySlice": 0.0017888545989990234, + "SimplifyTensor": 0.006832122802734375, + "SpillPSum": 0.022799968719482422, + "SplitAPUnionSets": 0.020108938217163086, + "SplitAccGrp": 0.0015766620635986328, + "StaticProfiler": 0.004146099090576172, + "StaticTransposeLocalTensor": 0.004926919937133789, + "SundaISel": 0.04472494125366211, + "TCTransform": 0.0018138885498046875, + "TensorInitialization": 0.004791736602783203, + "TensorOpSimplifier": 0.0064849853515625, + "TensorOpTransform": 0.0333099365234375, + "TileCCOps": 0.0056035518646240234, + "TilingProfiler": 0.01600933074951172, + "TransformConvOp": 0.002446413040161133, + "TritiumFusion": 0.1239166259765625, + "ValueNumbering": 0.0030901432037353516, + "VectorizeDMA": 0.0017311573028564453, + "VectorizeMatMult": 0.018932580947875977, + "WeightCoalescing": 0.0027513504028320313, + "ZeroSizeTensorElimination": 0.00011587142944335938 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 15811.0, + "StaticProfiler::AifUb": 934.4357299804688, + "StaticProfiler::ArithmeticIntensityTensorizer": 708.8487548828125, + "StaticProfiler::AverageDmaLength": 1109.3380126953125, + "StaticProfiler::AverageFractalPeUtilization": 100.0, + "StaticProfiler::AveragePartitionUtilization": 99.8372802734375, + "StaticProfiler::AveragePeUtilization": 100.0, + "StaticProfiler::DDRTransferBytes": 306283520.0, + "StaticProfiler::InternalTransferBytes": 104595456.0, + "StaticProfiler::LoadExpanded": 257536.0, + "StaticProfiler::LocalizationEfficiency": 75.85848236083984, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 85.1915054321289, + "StaticProfiler::StoreExpanded": 10241.0, + "StaticProfiler::TotalDMAExpanded": 267777.0, + "StaticProfiler::TotalDynamicInstancesCount": 19667.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 19667.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 64.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 12288.0, + "TilingProfiler::NumPfTransposes": 9.0, + "TilingProfiler::NumPfTransposesForIo": 3.0, + "TilingProfiler::NumPfTransposesForLocal": 4.0, + "TilingProfiler::NumPfTransposesForNonlocal": 2.0, + "TilingProfiler::PfTransposeInstructions": 1904.0, + "TilingProfiler::PfTransposeInstructionsForIo": 272.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 1120.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 512.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 0.0, + "TilingProfiler::SimdInstructionsAfterTiling": 704.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg0002": { + "compiletime": { + "AGOrderingAnalysisPass": 0.01837611198425293, + "AffinePredicateResolution": 0.0011184215545654297, + "AliasDependencyElimination": 0.00015664100646972656, + "AliasDependencyInduction": 0.005170583724975586, + "AliasDependencyReset": 0.027508020401000977, + "BFComputeCutting": 0.0036101341247558594, + "BirCodeGenLoop": 0.4774467945098877, + "CCOpFusion": 0.033265113830566406, + "CanonicalizeDAGForPGTiling": 0.004282712936401367, + "CanonicalizeIR": 0.0024569034576416016, + "CoalesceCCOp": 0.013993978500366211, + "CommuteConcat": 0.0017316341400146484, + "DMALocalityOpt": 0.005455970764160156, + "DMAProfiler": 0.012103080749511719, + "DMATilingProfiler": 0.0037560462951660156, + "DataLocalityOpt": 0.07645320892333984, + "DataStreaming": 0.03701519966125488, + "DeConcat": 0.0018520355224609375, + "DeadCodeElimination": 0.0020148754119873047, + "DeadStoreElimination": 0.006912708282470703, + "DelinearIndices": 0.004647254943847656, + "Delinearization": 0.003908872604370117, + "DoNothing": 7.414817810058594e-05, + "DramToDramTranspose": 0.02015542984008789, + "DumpGraphAndMetadata": 0.08691883087158203, + "EliminateDivs": 0.0025060176849365234, + "ExpandBatchNorm": 0.0027189254760742188, + "ExpandISAMacro": 0.010967254638671875, + "FactorizeBlkDims": 0.009678840637207031, + "FactorizeThreadAxesInFreeDims": 0.0023202896118164063, + "FlattenMacroLoop": 0.00232696533203125, + "GenericAccessSimplifier": 0.0008094310760498047, + "InferInitValue": 0.02833867073059082, + "InferIntrinsicOnCC": 0.008923768997192383, + "InferNeuronTensor": 0.025766372680664063, + "InferNonlocalTensors": 0.014599800109863281, + "InferPSumTensor": 0.28374218940734863, + "InlineNativeKernels": 0.00860905647277832, + "InsertIOTransposes": 0.01989889144897461, + "InsertLocalTransposes": 0.004229307174682617, + "InsertOffloadedTransposes": 0.0029871463775634766, + "LICM": 0.0030870437622070313, + "LateLegalizeInst": 0.01364445686340332, + "LateLegalizePostSplit": 0.014872312545776367, + "LateLowerReshapeOp": 0.0010464191436767578, + "LateLowerTensorOp": 0.002707242965698242, + "LateNeuronInstComb": 0.01008152961730957, + "LayoutPreprocessing": 0.026853561401367188, + "LayoutPreprocessingAndAnalysis": 0.0556035041809082, + "LayoutRequirementAnalysis": 0.004946470260620117, + "LegalizeCCOpLayout": 0.0025353431701660156, + "LegalizeOpLevelAlias": 0.0018966197967529297, + "LegalizePartitionReduce": 0.0017490386962890625, + "LegalizeSundaAccess": 0.0763850212097168, + "LegalizeSundaMacro": 0.012125253677368164, + "LegalizeType": 0.012414693832397461, + "LocalLayoutOpt": 0.013860225677490234, + "LoopFusion": 0.005201578140258789, + "LoopSplitting": 0.0003204345703125, + "LowerBroadcast": 0.0018322467803955078, + "LowerCCOpBlockAxis": 0.0040171146392822266, + "LowerComplexBroadcast": 0.002280712127685547, + "LowerIntrinsics": 0.3141806125640869, + "LowerTensorOp": 0.01141357421875, + "LowerTranspose": 0.012679815292358398, + "MacroGeneration": 0.034410953521728516, + "MaskPropagation": 0.0028192996978759766, + "MemcpyElimination": 0.02788853645324707, + "MutateDataType": 0.0012311935424804688, + "NeuronAliasDependencyInduction": 0.0001773834228515625, + "NeuronAliasDependencyReset": 0.024976015090942383, + "NeuronInstComb": 0.004675865173339844, + "NeuronLICM": 0.03631091117858887, + "NeuronLoopFusion": 0.008457422256469727, + "NeuronLoopInterchange": 0.001413106918334961, + "NeuronSimplifier": 0.007856369018554688, + "NeuronSimplifyPredicates": 0.11957573890686035, + "NeuronValueNumbering": 0.004334449768066406, + "OptimizeAliasedCopyChain": 0.0006341934204101563, + "OptimizeNKIKernels": 0.38834357261657715, + "PAGLayoutOpt": 0.0889735221862793, + "PComputeCutting": 0.005109071731567383, + "PGLayoutTilingPipeline": 0.6248171329498291, + "PGTiling": 0.1645822525024414, + "PadElimination": 0.0003485679626464844, + "ParAxesAnnotation": 0.05196070671081543, + "PartialLoopFusion": 0.011112451553344727, + "PartialSimdFusion": 0.012138128280639648, + "PerfectLoopNest": 0.002288341522216797, + "RecognizeOpIdiom": 0.0041277408599853516, + "Recompute": 0.00026416778564453125, + "RelaxPredicates": 0.01356959342956543, + "Rematerialization": 0.0024864673614501953, + "ReshapeWeights": 0.0007522106170654297, + "ResolveAccessConflict": 0.0048482418060302734, + "ResolveComplicatePredicates": 0.0015094280242919922, + "RewriteReplicationMatmul": 0.0015668869018554688, + "RewriteWeights": 0.0027174949645996094, + "SFKVectorizer": 0.27501797676086426, + "SimpleAllReduceTiling": 0.009322166442871094, + "Simplifier": 0.003630399703979492, + "SimplifyMacroPredicates": 0.011396646499633789, + "SimplifyNeuronTensor": 1.0555970668792725, + "SimplifySlice": 0.0023348331451416016, + "SimplifyTensor": 0.005601167678833008, + "SpillPSum": 0.013073921203613281, + "SplitAPUnionSets": 0.11336159706115723, + "SplitAccGrp": 0.001394510269165039, + "StaticProfiler": 0.014252662658691406, + "StaticTransposeLocalTensor": 0.003930330276489258, + "SundaISel": 0.04436635971069336, + "TCTransform": 0.0008757114410400391, + "TensorInitialization": 0.01558232307434082, + "TensorOpSimplifier": 0.004608869552612305, + "TensorOpTransform": 0.01923346519470215, + "TileCCOps": 0.005507707595825195, + "TilingProfiler": 0.007405757904052734, + "TransformConvOp": 0.0030219554901123047, + "TritiumFusion": 0.05425119400024414, + "ValueNumbering": 0.0020017623901367188, + "VectorizeDMA": 0.002228975296020508, + "VectorizeMatMult": 0.006806135177612305, + "WeightCoalescing": 0.008460044860839844, + "ZeroSizeTensorElimination": 0.00014281272888183594 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 49538.0, + "StaticProfiler::AifUb": 304.240234375, + "StaticProfiler::ArithmeticIntensityTensorizer": 278.67474365234375, + "StaticProfiler::AverageDmaLength": 1974.1033935546875, + "StaticProfiler::AverageFractalPeUtilization": 99.7004623413086, + "StaticProfiler::AveragePartitionUtilization": 97.94140625, + "StaticProfiler::AveragePeUtilization": 98.78884887695313, + "StaticProfiler::DDRTransferBytes": 862646080.0, + "StaticProfiler::InternalTransferBytes": 669456896.0, + "StaticProfiler::LoadExpanded": 390679.0, + "StaticProfiler::LocalizationEfficiency": 91.59693145751953, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 95.863037109375, + "StaticProfiler::StoreExpanded": 7261.0, + "StaticProfiler::TotalDMAExpanded": 397940.0, + "StaticProfiler::TotalDynamicInstancesCount": 59578.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 59132.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 4.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 28224.0, + "TilingProfiler::NumPfTransposes": 5.0, + "TilingProfiler::NumPfTransposesForIo": 1.0, + "TilingProfiler::NumPfTransposesForLocal": 1.0, + "TilingProfiler::NumPfTransposesForNonlocal": 3.0, + "TilingProfiler::PfTransposeInstructions": 19777.0, + "TilingProfiler::PfTransposeInstructionsForIo": 19008.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 1.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 768.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 6.0, + "TilingProfiler::SimdInstructionsAfterTiling": 303.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg01": { + "compiletime": { + "CanonicalizeConv": 2.2000000171829015e-05, + "CanonicalizeForTensorizer": 1.4999999621068127e-05, + "Canonicalizer": 0.00025499999173916876, + "HoistCompute": 1.9999999949504854e-06, + "IdentifyCrossPassTensors": 2.499999936844688e-05, + "MemcastMotion": 7.000000096013537e-06, + "PenguinizeFunctions": 1.4999999621068127e-05, + "PruneFunctions": 1.8999999156221747e-05, + "RemoveOptimizationBarriers": 2.700000004551839e-05, + "ScatterMotion": 1.9999999494757503e-05, + "TensorizerLegalizationPass": 1.9999999494757503e-05, + "VerifySupportedOps": 1.1000000085914508e-05, + "algsimp": 6.299999949987978e-05, + "batchnorm_expander": 1.4000000192027073e-05, + "boundary-marker-removal": 4.999999873689376e-06, + "call-inliner": 9.000000318337698e-06, + "canonicalize-boundary-marker": 6.000000212225132e-06, + "collective-stream-id-checker": 4.999999873689376e-06, + "comparison-expander": 4.999999873689376e-06, + "computation-deduplicator": 2.5999999706982635e-05, + "conditional-to-select": 4.999999873689376e-06, + "config-lowering": 2.2000000171829015e-05, + "constant_folding": 9.000000318337698e-06, + "cse": 1.2000000424450263e-05, + "dce": 9.999999974752427e-07, + "dynamic-slice-transpose": 3.999999989900971e-06, + "eliminate-redundant-compare": 3.999999989900971e-06, + "emit-offloaded-dropout": 1.2999999853491317e-05, + "flatten-call-graph": 9.000000318337698e-06, + "fuse-send-recv": 2.9999999242136255e-05, + "hilo::LegalizeAlias": 4.999999873689376e-06, + "hilo::NeuronInstCombine": 3.600000127335079e-05, + "hilo::NeuronOpFusion": 1.4000000192027073e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 2.099999983329326e-05, + "hilo::ScheduleFusion": 9.999999974752427e-07, + "hilo::SixtyFourHack": 1.4000000192027073e-05, + "hilo::VerifyAliasing": 1.9999999949504854e-06, + "hlo-mac-count": 4.600000102072954e-05, + "hlo-verifier": 0.00023299999884329736, + "legalize-ccops": 9.999999974752427e-07, + "legalize-compare": 3.999999989900971e-06, + "lower-argminmax-custom-call": 3.999999989900971e-06, + "map-inline": 1.1000000085914508e-05, + "metadata-naming": 2.700000004551839e-05, + "mlir::detail::OpToOpPassAdaptor": 0.00017299999308306724, + "mlir::hlo::MhloToPyPenguin": 0.0009840000420808792, + "mlir::mhlo::LowerComplexExtraPass": 9.600000339560211e-05, + "mlir::mhlo::LowerComplexPass": 0.00013600000238511711, + "native-to-custom-softmax": 6.000000212225132e-06, + "native-to-custom-softmax-dx": 2.2000000171829015e-05, + "operand_upcaster": 2.4000000848900527e-05, + "post-par-pipe-begin": 3.000000106112566e-06, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0005660000024363399, + "replace-minimum-constant": 6.000000212225132e-06, + "reshape-mover": 3.000000106112566e-06, + "simplify-concat": 4.8999998398358e-05, + "simplify-while-loops": 1.9999999949504854e-06, + "transform-variadic-reduce": 9.000000318337698e-06, + "tuple-simplifier": 4.999999873689376e-06, + "unpack-nested-aws-ntwsr": 3.999999989900971e-06, + "unroll-while-loop": 0.0 + }, + "hilo": { + "ArithmeticIntensity": 834.6854858398438, + "HloMacCount": 103079215104.0, + "Traffic": 246989344.0 + } + }, + "sg02": { + "compiletime": { + "CanonicalizeConv": 0.0, + "CanonicalizeForTensorizer": 1.4000000192027073e-05, + "Canonicalizer": 0.0003129999968223274, + "HoistCompute": 9.999999974752427e-07, + "IdentifyCrossPassTensors": 1.2000000424450263e-05, + "MemcastMotion": 0.0, + "PenguinizeFunctions": 9.999999747378752e-06, + "PruneFunctions": 7.999999979801942e-06, + "RemoveOptimizationBarriers": 9.000000318337698e-06, + "ScatterMotion": 3.000000106112566e-06, + "TensorizerLegalizationPass": 6.000000212225132e-06, + "VerifySupportedOps": 1.2000000424450263e-05, + "algsimp": 5.999999848427251e-05, + "batchnorm_expander": 1.2999999853491317e-05, + "boundary-marker-removal": 3.999999989900971e-06, + "call-inliner": 9.999999747378752e-06, + "canonicalize-boundary-marker": 4.999999873689376e-06, + "collective-stream-id-checker": 3.999999989900971e-06, + "comparison-expander": 4.999999873689376e-06, + "computation-deduplicator": 2.5999999706982635e-05, + "conditional-to-select": 7.000000096013537e-06, + "config-lowering": 2.5999999706982635e-05, + "constant_folding": 9.000000318337698e-06, + "cse": 1.2000000424450263e-05, + "dce": 9.999999974752427e-07, + "dynamic-slice-transpose": 3.999999989900971e-06, + "eliminate-redundant-compare": 3.000000106112566e-06, + "emit-offloaded-dropout": 1.2999999853491317e-05, + "flatten-call-graph": 1.2000000424450263e-05, + "fuse-send-recv": 1.9999999494757503e-05, + "hilo::LegalizeAlias": 1.9999999949504854e-06, + "hilo::NeuronInstCombine": 6.800000119255856e-05, + "hilo::NeuronOpFusion": 3.999999989900971e-06, + "hilo::ReplaceTokenTypeWithU8Pass": 2.2000000171829015e-05, + "hilo::ScheduleFusion": 0.00015900000289548188, + "hilo::SixtyFourHack": 3.9999998989515007e-05, + "hilo::VerifyAliasing": 9.999999974752427e-07, + "hlo-mac-count": 0.00017699999443721026, + "hlo-verifier": 0.0001829999964684248, + "legalize-ccops": 1.9999999949504854e-06, + "legalize-compare": 3.000000106112566e-06, + "lower-argminmax-custom-call": 3.000000106112566e-06, + "map-inline": 1.2000000424450263e-05, + "metadata-naming": 1.5999999959603883e-05, + "mlir::detail::OpToOpPassAdaptor": 1.1000000085914508e-05, + "mlir::hlo::MhloToPyPenguin": 0.0008900000248104334, + "mlir::mhlo::LowerComplexExtraPass": 8.800000068731606e-05, + "mlir::mhlo::LowerComplexPass": 0.00011999999696854502, + "native-to-custom-softmax": 6.000000212225132e-06, + "native-to-custom-softmax-dx": 2.4000000848900527e-05, + "operand_upcaster": 1.5999999959603883e-05, + "post-par-pipe-begin": 1.9999999949504854e-06, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0005510000046342611, + "replace-minimum-constant": 9.000000318337698e-06, + "reshape-mover": 3.000000106112566e-06, + "simplify-concat": 4.400000034365803e-05, + "simplify-while-loops": 1.9999999949504854e-06, + "transform-variadic-reduce": 6.299999949987978e-05, + "tuple-simplifier": 4.999999873689376e-06, + "unpack-nested-aws-ntwsr": 3.999999989900971e-06, + "unroll-while-loop": 0.0 + }, + "hilo": { + "ArithmeticIntensity": 194.41075134277344, + "HloMacCount": 77620576256.0, + "Traffic": 798521408.0 + } + } +} \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk3/graph.neff b/context_encoding_model/_tp0_bk3/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..f28021a86f6462720c45f0d0c5e263d0bea73428 --- /dev/null +++ b/context_encoding_model/_tp0_bk3/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3602ab29177b01531c0dbdb62bc869556ef53a934ba98dd3bd846e75e171cc3a +size 2561024 diff --git a/context_encoding_model/_tp0_bk3/log-neuron-cc.txt b/context_encoding_model/_tp0_bk3/log-neuron-cc.txt new file mode 100644 index 0000000000000000000000000000000000000000..be38d882a4a0c5ba944dcd949b34851ae069aeb9 --- /dev/null +++ b/context_encoding_model/_tp0_bk3/log-neuron-cc.txt @@ -0,0 +1,5272 @@ +2025-08-07T13:53:51Z INFO 47918 [root]: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/bin/neuronx-cc compile --framework=XLA /home/ubuntu/qwen3/context_encoding_model/_tp0_bk3/model.MODULE_b3ddbc97e5f0d1d64c82+155de413.hlo_module.pb --output /home/ubuntu/qwen3/context_encoding_model/_tp0_bk3/model.MODULE_b3ddbc97e5f0d1d64c82+155de413.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma' --lnc=1 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=/home/ubuntu/qwen3/context_encoding_model/_tp0_bk3/log-neuron-cc.txt --verbose=35 +2025-08-07T13:53:51Z INFO 47918 [root]: NeuronX Compiler version 2.20.9961.0+0acef03a Python version 3.10.12 HWM version 2.20.0.9961+0acef03a NumPy version 1.26.4 Running on AMI ami-040348201d80b58ad Running in region usw2-az4 +2025-08-07T13:53:51Z INFO 48502 [root]: XLA detected +2025-08-07T13:53:51Z INFO 48502 [root]: Pipeline: HLOToTensorizer Frontend StaticIOTranspose WalrusDriver BIRLinker Kelper NeffWrapper +2025-08-07T13:53:51Z INFO 48502 [root]: Intermediate files stored in /home/ubuntu/qwen3/context_encoding_model/_tp0_bk3/neuronxcc-dfnjq5y6, output in /home/ubuntu/qwen3/context_encoding_model/_tp0_bk3 +2025-08-07T13:53:51Z INFO 48502 [pipeline.Pipeline.0]: Job Pipeline len(in_states) 1 +2025-08-07T13:53:51Z INFO 48502 [pipeline.Pipeline.0]: Processing input #0 +2025-08-07T13:53:51Z INFO 48502 [pipeline.Pipeline.0]: Running pipeline Pipeline.0 +2025-08-07T13:53:51Z INFO 48502 [pipeline.Pipeline.0]: Starting job job.HLOToTensorizer.0 +2025-08-07T13:53:51Z INFO 48502 [job.HLOToTensorizer.0]: Job HLOToTensorizer len(in_states) 1 +2025-08-07T13:53:51Z INFO 48502 [job.HLOToTensorizer.0]: Processing input #0 +2025-08-07T13:53:51Z INFO 48502 [job.HLOToTensorizer.0]: IR signature: 9068f3ba4f55e1b8b35adde74efc6a9e617baa344783aaee62353f9181c3092c for model.MODULE_b3ddbc97e5f0d1d64c82+155de413.hlo_module.pb +2025-08-07T13:53:51Z INFO 48502 [job.HLOToTensorizer.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo2penguin --input /home/ubuntu/qwen3/context_encoding_model/_tp0_bk3/model.MODULE_b3ddbc97e5f0d1d64c82+155de413.hlo_module.pb --out-dir ./ --output penguin.py --remat --max-costly-ops=2 --max-live-in-size=5 --max-remat-chain-size=10 --max-mem-multiple=1.8 --min-def-use-distance=500 --remat-policy=transformer --allow-same-pass-remat=true --layers-per-module=1 --partition --emit-tensor-level-dropout-ops --modular-flow-mac-threshold=10 --verify-hlo=true --native-to-custom-softmax --partitioner-opts='--transformer' +2025-08-07T13:53:52Z INFO 48502 [job.HLOToTensorizer.0]: DEBUG: needsModular_PreSplit? Yes. macCnt 3711162974208 threshold 4398046511104 num non-trivial Ops 3871 +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 38 + +Pre-Partition Pre-Opt Histogram: +total HLO instructions: 10617 + reshape 2091 19.69% ################################################################ + broadcast 1731 16.30% #################################################### + convert 1281 12.07% ####################################### + transpose 1268 11.94% ###################################### + constant 815 7.68% ######################## + parameter 475 4.47% ############## + slice 445 4.19% ############# + add 365 3.44% ########### + multiply 327 3.08% ########## + dot 326 3.07% ######### + get-tuple-element 295 2.78% ######### + select 255 2.40% ####### + compare 222 2.09% ###### + call 186 1.75% ##### + concatenate 148 1.39% #### + tuple 73 0.69% ## + scatter 73 0.69% ## + negate 72 0.68% ## + all-reduce 72 0.68% ## + custom-call 38 0.36% # + divide 37 0.35% # + iota 7 0.07% + gather 6 0.06% + all-gather 3 0.03% + reduce 3 0.03% + sine 1 0.01% + cosine 1 0.01% + maximum 1 0.01% + +INFO: IoStatistics: total inputs: 475 +INFO: IoStatistics: total outputs: 73 +INFO: IoStatistics: total passthrough tensors: 0 +INFO: IoStatistics: total outputs read from: 0 +INFO: IoStatistics: total redundant outputs: 0 +INFO: IoStatistics: total ifmap size (KiB): 8072802 +INFO: IoStatistics: total ofmap size (KiB): 73728 +INFO: IoStatistics: total must-alias size (KiB): 73728 +INFO: IoStatistics: total may-alias size (KiB): 0 +INFO: HloMacCount has found 3711162908672 +INFO: Traffic has found 8885483693 +INFO: AIF 835.33 + +Pre-Partition Post-Op Histogram: +total HLO instructions: 6623 + reshape 1424 21.50% ################################################################ + convert 992 14.98% ############################################ + transpose 941 14.21% ########################################## + constant 523 7.90% ####################### + parameter 475 7.17% ##################### + broadcast 410 6.19% ################## + dot 325 4.91% ############## + custom-call 223 3.37% ########## + multiply 219 3.31% ######### + add 219 3.31% ######### + get-tuple-element 151 2.28% ###### + slice 147 2.22% ###### + concatenate 146 2.20% ###### + select 110 1.66% #### + compare 76 1.15% ### + scatter 73 1.10% ### + negate 72 1.09% ### + all-reduce 72 1.09% ### + gather 6 0.09% + iota 5 0.08% + all-gather 3 0.05% + reduce 3 0.05% + pad 2 0.03% + sine 1 0.02% + divide 1 0.02% + tuple 1 0.02% + maximum 1 0.02% + rng 1 0.02% + cosine 1 0.02% + +INFO: Found compute bound graph +DEBUG: needsModular_PreSplit? Yes. macCnt 3711162908672 threshold 4398046511104 num non-trivial Ops 2702 +DEBUG: transformer model +INFO: Partitioner configs:ModularFlow BO LBL SA ConcatGraphs: 1 MaxDisj:2 MaxSep:4 LPM:1 +INFO: Markers NOT detected +Potential split-points stats: #CC 75 #AR 72 #AG 3 #BN 0 nClamp 0 +DEBUG: needsModular_SplitFinder? Yes. +ModuleSplitter initial partitioning... #parts 75 +ModuleSplitter initial partitioning... Done. +INFO: Num of unique Module Definitions: 6 +DEBUG: DefMap: 0 1 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 73 74 +New disjoint wave: start 2 len 70 NumReps: 35 macs 3607772528640 +INFO: Attempting to identify and split optimizer at end +First non-zero-mac/used part from the end is 73 +Not enough zero-mac parts. skip +INFO: Optimized 0 all-reduce split instructions +INFO: Number of splitPoints: 37 +ModuleSplitter initial partitioning... #parts 37 +ModuleSplitter initial partitioning... Done. +Remat: gather-iota 0 matches, 0 ops rematted +INFO: Alias legality verification of partitions PASSED. +INFO: No transposable_weight_idx attrs found +INFO: Peak intermediate memory demand is at Partition 1. Num live intermediates at peak is 9 and memory usage is 35127300 bytes. +INFO: Please refer to LiveRangeReport_PostHloPart.txt for detailed intermediate lifetime info. +DEBUG: DefMap: 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 36 +Wrote HLO netlist to hlo_netlist.json +Wrote graph partitions in debug_info_hlo_partitions.json +Processing partition 0 +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 0 +Replaced 0 dropout sequences with OffloadedDropout +INFO: HloMacCount has found 25769803776 +INFO: Traffic has found 705741606 +INFO: AIF 73.03 +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert cosine custom-call dot gather get-tuple-element iota multiply negate parameter reshape scatter select sine slice transpose tuple +Invoking RemoveOptimizationBarriers pass +Processing partition 1 +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 0 +Replaced 0 dropout sequences with OffloadedDropout +INFO: HloMacCount has found 103079215104 +INFO: Traffic has found 246989348 +INFO: AIF 834.69 +HLO Ops used in computation: add all-reduce broadcast compare concatenate constant convert custom-call dot get-tuple-element multiply negate parameter reshape scatter select slice transpose tuple +Invoking RemoveOptimizationBarriers pass +Processing partition 2 +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 0 +Replaced 0 dropout sequences with OffloadedDropout +INFO: HloMacCount has found 77620576256 +INFO: Traffic has found 798521419 +INFO: AIF 194.41 +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert custom-call divide dot gather get-tuple-element iota maximum multiply pad parameter reduce reshape rng scatter select slice transpose tuple +Invoking RemoveOptimizationBarriers pass + +2025-08-07T13:53:52Z INFO 48502 [job.HLOToTensorizer.0]: IR signature: 4cb5bb30df98c0f4fe837212bb465c077814190c1515012736514ef3b85e9119 for sg0000/HLOToTensorizer +2025-08-07T13:53:52Z INFO 48502 [job.HLOToTensorizer.0]: IR signature: 7d62bccc8bf6f747c9f4be1d037998542e378a1e9c073d1354821dafa6e067fe for sg0001/HLOToTensorizer +2025-08-07T13:53:52Z INFO 48502 [job.HLOToTensorizer.0]: IR signature: f703193f38eab27445c0b7b02fa8c772086cee3728a75bd67c3dcc8214cedceb for sg0002/HLOToTensorizer +2025-08-07T13:53:52Z INFO 48502 [job.HLOToTensorizer.0]: Job #0 finished +2025-08-07T13:53:52Z INFO 48502 [pipeline.Pipeline.0]: Finished job job.HLOToTensorizer.0 +2025-08-07T13:53:52Z INFO 48502 [pipeline.Pipeline.0]: Starting job job.Frontend.0 +2025-08-07T13:53:52Z INFO 48502 [job.Frontend.0]: Job Frontend len(in_states) 1 +2025-08-07T13:53:52Z INFO 48502 [job.Frontend.0]: Processing input #0 +2025-08-07T13:53:52Z INFO 48502 [job.Frontend.0]: Start model loading +2025-08-07T13:53:52Z INFO 48502 [job.Frontend.0]: Start tensorization +2025-08-07T13:53:52Z INFO 48502 [job.Frontend.0]: Num jobs: 128 +2025-08-07T13:53:52Z USER 48502 [root/Tensorizer/Tensorizer]: Running Tensorizer +2025-08-07T13:53:52Z INFO 48502 [Tensorizer]: Max workers: 3 +2025-08-07T13:53:52Z INFO 49124 [Tensorizer]: Building model from Penguin script "penguin.py.000000"... +2025-08-07T13:53:52Z INFO 49125 [Tensorizer]: Building model from Penguin script "penguin.py.000001"... +2025-08-07T13:53:52Z INFO 49126 [Tensorizer]: Building model from Penguin script "penguin.py.000002"... +2025-08-07T13:53:52Z INFO 49125 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=1 --num-neuroncores-per-sengine=1 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-08-07T13:53:52Z INFO 49124 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=1 --num-neuroncores-per-sengine=1 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-08-07T13:53:52Z INFO 49125 [sg0001/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:53:52Z INFO 49124 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:53:52Z INFO 49125 [sg0001/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49124 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49126 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=1 --num-neuroncores-per-sengine=1 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49124 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:53:52Z INFO 49124 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-08-07T13:53:52Z INFO 49124 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49124 [sg0000/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49124 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-08-07T13:53:52Z INFO 49124 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49124 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49124 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:52Z INFO 49124 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49125 [sg0001/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:53:52Z INFO 49125 [sg0001/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-08-07T13:53:52Z INFO 49125 [sg0001/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49125 [sg0001/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.001 seconds +2025-08-07T13:53:52Z INFO 49125 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-08-07T13:53:52Z INFO 49125 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49125 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.001 seconds +2025-08-07T13:53:52Z INFO 49125 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:52Z INFO 49125 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49125 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49125 [sg0001/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-08-07T13:53:52Z INFO 49125 [sg0001/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49124 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.003 seconds +2025-08-07T13:53:52Z INFO 49124 [sg0000/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-08-07T13:53:52Z INFO 49124 [sg0000/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49124 [sg0000/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.003 seconds +2025-08-07T13:53:52Z INFO 49124 [sg0000/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-08-07T13:53:52Z INFO 49124 [sg0000/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.001 seconds +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49124 [sg0000/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.013 seconds +2025-08-07T13:53:52Z INFO 49124 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:53:52Z INFO 49124 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:52Z INFO 49124 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49124 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:52Z INFO 49124 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:52Z INFO 49124 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49124 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.007 seconds +2025-08-07T13:53:52Z INFO 49124 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.029 seconds +2025-08-07T13:53:52Z INFO 49124 [sg0000/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-08-07T13:53:52Z INFO 49124 [sg0000/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.003 seconds +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49124 [sg0000/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.007 seconds +2025-08-07T13:53:52Z INFO 49124 [sg0000/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-08-07T13:53:52Z INFO 49124 [sg0000/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.011 seconds +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.005 seconds +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.028 seconds +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.005 seconds +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/LegalizeCCOpLayout]: Finished (changed=True) +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.003 seconds +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.002 seconds +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.001 seconds +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.003 seconds +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.001 seconds +2025-08-07T13:53:52Z INFO 49126 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.013 seconds +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.003 seconds +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.003 seconds +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.003 seconds +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/LegalizeCCOpLayout]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.019 seconds +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/LateLowerTensorOp]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.003 seconds +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.005 seconds +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.028 seconds +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.003 seconds +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.028 seconds +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.012 seconds +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/LoopFusion]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.005 seconds +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.006 seconds +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.082 seconds +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.018 seconds +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.006 seconds +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.003 seconds +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.004 seconds +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.015 seconds +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/Rematerialization]: Running Rematerialization +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/Rematerialization]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/Rematerialization]: Rematerialization finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.028 seconds +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.005 seconds +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.001 seconds +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.005 seconds +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.015 seconds +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.009 seconds +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.033 seconds +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.010 seconds +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/Delinearization]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.010 seconds +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/DeadStoreElimination]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.109 seconds +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/LoopFusion]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.005 seconds +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.029 seconds +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/Rematerialization]: Running Rematerialization +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/Rematerialization]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.003 seconds +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/Rematerialization]: Rematerialization finished after 0.005 seconds +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.013 seconds +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/Delinearization]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.033 seconds +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.005 seconds +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.008 seconds +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.029 seconds +2025-08-07T13:53:53Z INFO 49125 [sg0001/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.019 seconds +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.003 seconds +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:53Z INFO 49126 [sg0002/Tensorizer/Delinearization]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.008 seconds +2025-08-07T13:53:53Z INFO 49124 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LoopFusion]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49124 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.008 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49124 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.038 seconds +2025-08-07T13:53:54Z INFO 49124 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:54Z INFO 49124 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.104 seconds +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:54Z INFO 49124 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 49124 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:54Z INFO 49124 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/LoopFusion]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49124 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49124 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:54Z INFO 49124 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.029 seconds +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/Rematerialization]: Running Rematerialization +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/Rematerialization]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/Rematerialization]: Rematerialization finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.009 seconds +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/Delinearization]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.012 seconds +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:53:54Z INFO 49124 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 49124 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:54Z INFO 49124 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49124 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.007 seconds +2025-08-07T13:53:54Z INFO 49124 [sg0000/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-08-07T13:53:54Z INFO 49124 [sg0000/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49124 [sg0000/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.001 seconds +2025-08-07T13:53:54Z INFO 49124 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:54Z INFO 49124 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.041 seconds +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.005 seconds +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/LoopFusion]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.007 seconds +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.006 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/PadElimination]: Running PadElimination +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/PadElimination]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/PadElimination]: PadElimination finished after 0.000 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LoopFusion]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.005 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.005 seconds +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/PadElimination]: Running PadElimination +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/PadElimination]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/PadElimination]: PadElimination finished after 0.000 seconds +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/LoopFusion]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.006 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.007 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/Recompute]: Running Recompute +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/Recompute]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49126 [Tensorizer]: After optimization: 38 statements +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/MutateDataType]: Running MutateDataType +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/MutateDataType]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/MutateDataType]: MutateDataType finished after 0.001 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/TileCCOps]: Running TileCCOps +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `All gather output tensor check failed` +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/TileCCOps]: in float32 (512,) %'all_gather.2' = AllGatherOp-149 AllGather_add(float32 (256,) %'add.11', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.8843 | hlo_id: 101 | , id = 149 +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=2048 is not above min_allgather_tile_size_in_bytes=8388608` +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/TileCCOps]: in uint32 (512,) %'all_gather.3' = AllGatherOp-165 AllGather_add(uint32 (256,) %'add.12', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.8978 | hlo_id: 110 | , id = 165 +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/TileCCOps]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/TileCCOps]: TileCCOps finished after 0.006 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.007 seconds +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.005 seconds +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:54Z INFO 49125 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.007 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.005 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.001 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/InferIntrinsicOnCC]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.009 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.005 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.014 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.005 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.005 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.027 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.005 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.056 seconds +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-08-07T13:53:54Z INFO 49126 [sg0002/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.015 seconds +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.005 seconds +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.003 seconds +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/PadElimination]: Running PadElimination +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/PadElimination]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/PadElimination]: PadElimination finished after 0.000 seconds +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.052 seconds +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.089 seconds +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.006 seconds +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.003 seconds +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/PGTiling]: Running PGTiling +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 585 of IO tensor {'CrossPassTensor': ''}bfloat16 %input471|NC|(128, 32) is not sorted, index list (w/ AG ids): [(16, 'AG49'), (13, 'AG50')] +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 586 of IO tensor {'CrossPassTensor': ''}bfloat16 %input472|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(16, 'AG49'), (13, 'AG50')] +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 587 of IO tensor {'CrossPassTensor': ''}bfloat16 %input470|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(16, 'AG49'), (13, 'AG50')] +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 588 of IO tensor {'CrossPassTensor': ''}bfloat16 %input469(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(10, 'AG54'), (15, 'AG52'), (11, 'AG53')] +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 589 of IO tensor {'CrossPassTensor': ''}bfloat16 %input474|NC|(128, 32) is not sorted, index list (w/ AG ids): [(16, 'AG49'), (13, 'AG50')] +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 540 of IO tensor {'CrossPassTensor': ''}bfloat16 %input473|NC|(75968, 32, 128) is not sorted, index list (w/ AG ids): [(14, 'AG59'), (13, 'AG50')] +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.018 seconds +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.005 seconds +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.005 seconds +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.000 seconds +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.003 seconds +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.034 seconds +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/PGTiling]: PGTiling finished after 0.165 seconds +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.035 seconds +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/Recompute]: Running Recompute +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/Recompute]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-08-07T13:53:55Z INFO 49125 [Tensorizer]: After optimization: 25 statements +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/MutateDataType]: Running MutateDataType +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/MutateDataType]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/MutateDataType]: MutateDataType finished after 0.001 seconds +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.005 seconds +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/TileCCOps]: Running TileCCOps +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/TileCCOps]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.020 seconds +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/TileCCOps]: TileCCOps finished after 0.006 seconds +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.003 seconds +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.013 seconds +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.020 seconds +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 0.625 seconds +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/TilingBottleneck]: 19008: transpose_128x128 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/TilingBottleneck]: 19008: matmul_128x128x1 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/TilingBottleneck]: 3072: matmul_128x128x512 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/TilingBottleneck]: 3072: matmul_128x128x512 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/TilingBottleneck]: 3072: matmul_128x128x512 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/TilingBottleneck]: 256: transpose_128x128 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/TilingBottleneck]: 256: transpose_128x128 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/TilingBottleneck]: 256: transpose_128x128 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/TilingBottleneck]: 96: simd128x512 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/TilingBottleneck]: 64: rmsnorm128x512x128 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/TilingBottleneck]: 64: simd128x512 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/TilingBottleneck]: 64: rmsnorm128x512x128 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/TilingBottleneck]: 2: reduce512x1x1 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/TilingBottleneck]: 2: simd1x512 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/TilingBottleneck]: 2: reduce512x1x1 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/TilingBottleneck]: 2: indirect_load128x1 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/TilingBottleneck]: 1: simd1x1 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/TilingBottleneck]: 1: simd1x1 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/TilingBottleneck]: 1: simd1x1 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/TilingBottleneck]: 1: indirect_load32x128 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.007 seconds +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.005 seconds +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.026 seconds +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.006 seconds +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/LICM]: LICM finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.002 seconds +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.010 seconds +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.007 seconds +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.001 seconds +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/InferIntrinsicOnCC]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.005 seconds +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.003 seconds +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/TCTransform]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.001 seconds +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.010 seconds +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.005 seconds +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.024 seconds +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.076 seconds +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 19008: transpose_128x128 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 19008: matmul_128x128x1 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 3072: matmul_128x128x512 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 3072: matmul_128x128x512 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 3072: matmul_128x128x512 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 594: transpose_128x1 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 384: dma128x512 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 256: transpose_128x128 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 256: transpose_128x128 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 256: transpose_128x128 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 96: dma128x2048 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 96: dma128x2048 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 96: simd128x512 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 64: rmsnorm128x512x128 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 64: simd128x512 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 64: rmsnorm128x512x128 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 32: dma128x1024 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 2: reduce512x1x1 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 2: simd1x512 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 2: reduce512x1x1 +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.031 seconds +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/Recompute]: Running Recompute +2025-08-07T13:53:55Z INFO 49124 [sg0000/Tensorizer/Recompute]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.004 seconds +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.010 seconds +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:53:55Z INFO 49125 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.007 seconds +2025-08-07T13:53:55Z INFO 49126 [sg0002/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.004 seconds +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.012 seconds +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.008 seconds +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.031 seconds +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.007 seconds +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.064 seconds +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.001 seconds +2025-08-07T13:53:56Z INFO 49124 [Tensorizer]: After optimization: 26 statements +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/MutateDataType]: Running MutateDataType +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/MutateDataType]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/MutateDataType]: MutateDataType finished after 0.001 seconds +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.031 seconds +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.005 seconds +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/TileCCOps]: Running TileCCOps +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/TileCCOps]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/TileCCOps]: TileCCOps finished after 0.008 seconds +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.007 seconds +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.003 seconds +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.001 seconds +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.092 seconds +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.007 seconds +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.132 seconds +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.003 seconds +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.003 seconds +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.006 seconds +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/PGTiling]: Running PGTiling +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.022 seconds +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 669 of IO tensor {'CrossPassTensor': ''}bfloat16 %input86|NC|(128, 32) is not sorted, index list (w/ AG ids): [(15, 'AG88'), (11, 'AG89')] +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 670 of IO tensor {'CrossPassTensor': ''}bfloat16 %input87|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(15, 'AG88'), (11, 'AG89')] +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 671 of IO tensor {'CrossPassTensor': ''}bfloat16 %input85|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(15, 'AG88'), (11, 'AG89')] +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 672 of IO tensor {'CrossPassTensor': ''}bfloat16 %input84(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(7, 'AG93'), (14, 'AG91'), (8, 'AG92')] +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 673 of IO tensor {'CrossPassTensor': ''}bfloat16 %input90|NC|(128, 32) is not sorted, index list (w/ AG ids): [(15, 'AG88'), (11, 'AG89')] +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 674 of IO tensor {'CrossPassTensor': ''}bfloat16 %input94(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(15, 'AG88'), (11, 'AG89')] +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 679 of IO tensor {'CrossPassTensor': ''}bfloat16 %input92(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(15, 'AG88'), (11, 'AG89')] +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 680 of IO tensor {'CrossPassTensor': ''}bfloat16 %input89|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(15, 'AG88'), (11, 'AG89')] +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 447 of IO tensor {'CrossPassTensor': ''}bfloat16 %input88(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(2, 'AG103'), (0, 'AG99'), (1, 'AG98'), (3, 'AG102'), (4, 'AG101')] +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.034 seconds +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.010 seconds +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/InferIntrinsicOnCC]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.005 seconds +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.010 seconds +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.004 seconds +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.007 seconds +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.017 seconds +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.011 seconds +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/InferInitValue]: Running InferInitValue +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.004 seconds +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.000 seconds +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/InferInitValue]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/InferInitValue]: InferInitValue finished after 0.028 seconds +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.009 seconds +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.008 seconds +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/SimplifyTensor]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.006 seconds +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/SundaISel]: Running SundaISel +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.005 seconds +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/SundaISel]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/SundaISel]: SundaISel finished after 0.044 seconds +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.000 seconds +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.025 seconds +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.002 seconds +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.033 seconds +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.007 seconds +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.123 seconds +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.128 seconds +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/PGTiling]: PGTiling finished after 0.452 seconds +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.003 seconds +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.024 seconds +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.004 seconds +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.006 seconds +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.008 seconds +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.042 seconds +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 0.933 seconds +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/TilingBottleneck]: 3072: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/TilingBottleneck]: 3072: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/TilingBottleneck]: 3072: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/TilingBottleneck]: 1024: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/TilingBottleneck]: 1024: transpose_128x128 +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/TilingBottleneck]: 1024: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/TilingBottleneck]: 256: transpose_128x128 +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/TilingBottleneck]: 256: transpose_128x128 +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/TilingBottleneck]: 256: transpose_128x128 +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/TilingBottleneck]: 256: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/TilingBottleneck]: 256: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/TilingBottleneck]: 256: softmax512x1x128 +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/TilingBottleneck]: 256: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/TilingBottleneck]: 256: matmul_128x128x512 +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/TilingBottleneck]: 96: simd128x512 +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/TilingBottleneck]: 64: rmsnorm128x512x128 +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/TilingBottleneck]: 64: simd128x512 +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/TilingBottleneck]: 64: rmsnorm128x512x128 +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/TilingBottleneck]: 32: rmsnorm128x512x128 +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/TilingBottleneck]: 32: simd64x512 +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.150 seconds +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-08-07T13:53:56Z INFO 49124 [sg0000/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.016 seconds +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.001 seconds +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.013 seconds +2025-08-07T13:53:56Z INFO 49125 [sg0001/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.006 seconds +2025-08-07T13:53:56Z INFO 49126 [sg0002/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.010 seconds +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.056 seconds +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.028 seconds +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/NeuronValueNumbering]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.009 seconds +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.002 seconds +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.008 seconds +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.005 seconds +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.004 seconds +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.005 seconds +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/VectorizeDMA]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.002 seconds +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.006 seconds +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.002 seconds +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/DeConcat]: Running DeConcat +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/DeConcat]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/DeConcat]: DeConcat finished after 0.002 seconds +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.002 seconds +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.012 seconds +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.330 seconds +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.150 seconds +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 3072: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 3072: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 3072: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 1024: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 1024: transpose_128x128 +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 1024: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 384: dma128x512 +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 256: transpose_128x128 +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 256: transpose_128x128 +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 256: transpose_128x128 +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 256: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 256: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 256: softmax512x1x128 +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 256: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 256: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: dma128x512 +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 96: dma128x2048 +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 96: dma128x2048 +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 96: simd128x512 +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 64: rmsnorm128x512x128 +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.008 seconds +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.368 seconds +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.054 seconds +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.005 seconds +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.003 seconds +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.009 seconds +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.006 seconds +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/PGTiling]: Running PGTiling +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.007 seconds +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.006 seconds +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.011 seconds +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.014 seconds +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.006 seconds +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 702 of IO tensor {'CrossPassTensor': ''}bfloat16 %input79|NC|(128, 2, 2, 8) is not sorted, index list (w/ AG ids): [(16, 'AG97'), (11, 'AG100'), (9, 'AG99'), (13, 'AG98')] +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 703 of IO tensor {'CrossPassTensor': ''}bfloat16 %input83(4, 4, 128, 2, 2, 8, 2, 64) is not sorted, index list (w/ AG ids): [(16, 'AG97'), (11, 'AG100'), (9, 'AG99'), (13, 'AG98')] +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 704 of IO tensor {'CrossPassTensor': ''}bfloat16 %input81(4, 128, 2, 2, 8, 2, 64) is not sorted, index list (w/ AG ids): [(16, 'AG97'), (11, 'AG100'), (9, 'AG99'), (13, 'AG98')] +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 705 of IO tensor {'CrossPassTensor': ''}bfloat16 %input78|NHWC|(4, 128, 2, 2, 8, 128) is not sorted, index list (w/ AG ids): [(16, 'AG97'), (11, 'AG100'), (9, 'AG99'), (13, 'AG98')] +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 459 of IO tensor {'CrossPassTensor': ''}bfloat16 %input77(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(10, 'AG111'), (5, 'AG107'), (8, 'AG106'), (12, 'AG110'), (14, 'AG109')] +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 569 of IO tensor {'IntermediateTensor': ''}bfloat16 %intermediate1(1024, 2, 2, 8, 128) is not sorted, index list (w/ AG ids): [(15, 'AG101'), (11, 'AG100'), (9, 'AG99'), (13, 'AG98')] +2025-08-07T13:53:57Z INFO 49126 [sg0002/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.082 seconds +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.013 seconds +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.005 seconds +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.014 seconds +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.009 seconds +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.002 seconds +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.004 seconds +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.000 seconds +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.011 seconds +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.004 seconds +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.001 seconds +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.003 seconds +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.007 seconds +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/InferInitValue]: Running InferInitValue +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.081 seconds +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/PGTiling]: PGTiling finished after 0.270 seconds +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/InferInitValue]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/InferInitValue]: InferInitValue finished after 0.043 seconds +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.016 seconds +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.013 seconds +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/SimplifyTensor]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.007 seconds +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/LICM]: LICM finished after 0.004 seconds +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/SundaISel]: Running SundaISel +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.003 seconds +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/SundaISel]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/SundaISel]: SundaISel finished after 0.045 seconds +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.000 seconds +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.020 seconds +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.002 seconds +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.002 seconds +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.035 seconds +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 1.321 seconds +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/TilingBottleneck]: 1024: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/TilingBottleneck]: 1024: transpose_128x128 +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/TilingBottleneck]: 1024: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/TilingBottleneck]: 256: transpose_128x128 +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/TilingBottleneck]: 256: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/TilingBottleneck]: 256: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/TilingBottleneck]: 256: softmax512x1x128 +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/TilingBottleneck]: 256: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/TilingBottleneck]: 256: matmul_128x128x512 +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/TilingBottleneck]: 128: transpose_128x128 +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/TilingBottleneck]: 128: transpose_128x128 +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/TilingBottleneck]: 128: transpose_128x128 +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/TilingBottleneck]: 64: simd128x512 +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/TilingBottleneck]: 64: rmsnorm128x512x128 +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/TilingBottleneck]: 32: indirect_load128x512 +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/TilingBottleneck]: 32: rmsnorm128x512x128 +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/TilingBottleneck]: 32: simd128x256 +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/TilingBottleneck]: 32: simd128x256 +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/TilingBottleneck]: 32: simd128x512 +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/TilingBottleneck]: 32: transpose_128x128 +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.023 seconds +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.016 seconds +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.003 seconds +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.017 seconds +2025-08-07T13:53:57Z INFO 49124 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.009 seconds +2025-08-07T13:53:57Z INFO 49125 [sg0001/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.022 seconds +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.051 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.009 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.003 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.061 seconds +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/NeuronValueNumbering]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.008 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.003 seconds +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.014 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.009 seconds +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/VectorizeDMA]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.001 seconds +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.003 seconds +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/DeConcat]: Running DeConcat +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/DeConcat]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/DeConcat]: DeConcat finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-08-07T13:53:58Z INFO 49126 [sg0002/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.013 seconds +2025-08-07T13:53:58Z INFO 49126 [sg0002/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:53:58Z INFO 49126 [sg0002/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49126 [sg0002/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49126 [sg0002/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49126 [sg0002/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.114 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1024: matmul_128x128x512 +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1024: transpose_128x128 +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1024: matmul_128x128x512 +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 256: transpose_128x128 +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 256: matmul_128x128x512 +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 256: matmul_128x128x512 +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 256: softmax512x1x128 +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 256: matmul_128x128x512 +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 256: matmul_128x128x512 +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: transpose_128x128 +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: transpose_128x128 +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: transpose_128x128 +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: dma128x512 +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: dma128x512 +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: rmsnorm128x512x128 +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: indirect_load128x512 +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: dma128x1024 +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: dma128x2048 +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: rmsnorm128x512x128 +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: simd128x256 +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.043 seconds +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.005 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.012 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.010 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49126 [sg0002/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.010 seconds +2025-08-07T13:53:58Z INFO 49126 [sg0002/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-08-07T13:53:58Z INFO 49126 [sg0002/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49126 [sg0002/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.001 seconds +2025-08-07T13:53:58Z INFO 49126 [sg0002/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:53:58Z INFO 49126 [sg0002/Tensorizer/SpillPSum]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.012 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:58Z INFO 49126 [sg0002/Tensorizer/SpillPSum]: SpillPSum finished after 0.013 seconds +2025-08-07T13:53:58Z INFO 49126 [sg0002/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.011 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.124 seconds +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.004 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.001 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.021 seconds +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.004 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.019 seconds +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.016 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/InferInitValue]: Running InferInitValue +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.021 seconds +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.006 seconds +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/InferInitValue]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/InferInitValue]: InferInitValue finished after 0.035 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.013 seconds +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.012 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/SimplifyTensor]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.015 seconds +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/SpillPSum]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.007 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/SpillPSum]: SpillPSum finished after 0.023 seconds +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/SundaISel]: Running SundaISel +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.043 seconds +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/SundaISel]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49126 [sg0002/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/LegalizeType]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/SundaISel]: SundaISel finished after 0.046 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.000 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.022 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.002 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49126 [sg0002/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.314 seconds +2025-08-07T13:53:58Z INFO 49126 [sg0002/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-08-07T13:53:58Z INFO 49126 [sg0002/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.010 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-08-07T13:53:58Z INFO 49126 [sg0002/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.009 seconds +2025-08-07T13:53:58Z INFO 49126 [sg0002/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49126 [sg0002/Tensorizer/LegalizeType]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/LegalizeType]: LegalizeType finished after 0.005 seconds +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.018 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.003 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.010 seconds +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:53:58Z INFO 49126 [sg0002/Tensorizer/LegalizeType]: LegalizeType finished after 0.012 seconds +2025-08-07T13:53:58Z INFO 49126 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.014 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49126 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.046 seconds +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/WeightCoalescing]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.003 seconds +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.026 seconds +2025-08-07T13:53:58Z INFO 49124 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 49126 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.036 seconds +2025-08-07T13:53:58Z INFO 49126 [sg0002/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.016 seconds +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.003 seconds +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-08-07T13:53:58Z INFO 49125 [sg0001/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.005 seconds +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.002 seconds +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/SimplifyNeuronTensor]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.047 seconds +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/NeuronValueNumbering]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.010 seconds +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/DataStreaming]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.004 seconds +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/DataStreaming]: DataStreaming finished after 0.005 seconds +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.010 seconds +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/VectorizeDMA]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.007 seconds +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.009 seconds +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.001 seconds +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/DeConcat]: Running DeConcat +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/DeConcat]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/DeConcat]: DeConcat finished after 0.001 seconds +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.001 seconds +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.035 seconds +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-08-07T13:53:59Z INFO 49126 [sg0002/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49126 [sg0002/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.284 seconds +2025-08-07T13:53:59Z INFO 49126 [sg0002/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:53:59Z INFO 49126 [sg0002/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49126 [sg0002/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.008 seconds +2025-08-07T13:53:59Z INFO 49126 [sg0002/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.082 seconds +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.201 seconds +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.018 seconds +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.004 seconds +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49126 [sg0002/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.018 seconds +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-08-07T13:53:59Z INFO 49126 [sg0002/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.076 seconds +2025-08-07T13:53:59Z INFO 49126 [sg0002/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-08-07T13:53:59Z INFO 49126 [sg0002/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.005 seconds +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/SimpleAllReduceTiling]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49126 [sg0002/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.014 seconds +2025-08-07T13:53:59Z INFO 49126 [sg0002/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.004 seconds +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 1.041ms (48.000MiB, est bw: 48.348GB/s, 45.083% of tot. time) for bfloat16<128 x 128> TongaSB partitions[5] bfloat16 (2, 2, 2, 2, 24, 128, 512) %'input84_local_915'[i15_0_0_921_0_0_1176,i15_0_0_921_0_1_1176,i15_0_0_1,c1_909,c2_910,i0.128,i1.128+128p_1377] = load bfloat16<128 x 128> {'CrossPassTensor': ''}bfloat16 (8, 4, 2, 128, 24, 128) %'input84'[4i15_0_0_921_0_0_1176+2i15_0_0_921_0_1_1176+i15_0_0_1,p_1377,c1_909,i0.128,c2_910,i1.128] # id=1086, src_id=None, , instances=1536 # dl = tensor_op_name: _dot.6 | hlo_id: 49 | [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 488.243us (96.000MiB, est bw: 206.175GB/s, 21.144% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 24, 2, 128, 2048) %1177[i11_0,i10_0_0,i10_0_1,c2_890,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 24, 128, 4096) %'input85'[i10_0_0,i10_0_1,i0.128,i1.2048+2048c2_890] # id=1077, src_id=None, , instances=192 # dl = tensor_op_name: _dot.4 | hlo_id: 39 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:53:59Z INFO 49126 [sg0002/Tensorizer/TensorInitialization]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 244.771us (48.000MiB, est bw: 205.627GB/s, 10.600% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (2, 24, 2, 128, 2048) %'input87_local_905'[i12_0_0,4i12_0_1_0+i12_0_1_1,c2_900,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 24, 128, 4096) %'input87'[i12_0_0,4i12_0_1_0+i12_0_1_1,i0.128,i1.2048+2048c2_900] # id=1080, src_id=None, , instances=96 # dl = tensor_op_name: _dot.5 | hlo_id: 30 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 163.614us (32.000MiB, est bw: 205.083GB/s, 7.086% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 4, 4, 2, 128, 2048) %1178[i40_0,i41_0,i41_1,c2_931,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (4, 4, 128, 4096) %'input94'[i41_0,i41_1,i0.128,i1.2048+2048c2_931] # id=1100, src_id=None, , instances=64 # dl = tensor_op_name: _dot.9 | hlo_id: 67 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 99.175us (16.000MiB, est bw: 169.167GB/s, 4.295% of tot. time) for bfloat16<128 x 1024> TongaSB partitions[4] bfloat16 (2, 2, 2, 4, 128, 4, 512) %'input88_local_1001'[i115_0_0_0_1007_0_0_1179,i115_0_0_0_1007_0_1_1179,i115_0_0_0_1,c1_994_1793,i0.128,i3.4,i1.128+128i2.2+256p_1392_1793] = load bfloat16<128 x 1024> {'CrossPassTensor': ''}bfloat16 (8, 2, 128, 4, 4, 2, 128) %'input88'[4i115_0_0_0_1007_0_0_1179+2i115_0_0_0_1007_0_1_1179+i115_0_0_0_1,p_1392_1793,i0.128,c1_994_1793,i3.4,i2.2,i1.128] # id=1156, src_id=None, , instances=64 # dl = tensor_op_name: _dot.10 | hlo_id: 165 | [[i0.128];[i1.128, i2.2, i3.4]] -> [[i0.128];[i1.128, i2.2, i3.4]] +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 41.879us (8.000MiB, est bw: 200.308GB/s, 1.814% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (2, 4, 2, 128, 2048) %'842.1337'[i11_0,T_i1_0,T_i2_0_1790,i0.128,i1.2048] = load bfloat16<128 x 2048> non_local bfloat16 (2, 4, 128, 4096) %'add.4'[i11_0,T_i1_0,i0.128,i1.2048+2048T_i2_0_1790] # id=1181, src_id=None, , instances=16 # dl = tensor_op_name: add.4_pftranspose_842 | hlo_id: 17 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 41.879us (8.000MiB, est bw: 200.308GB/s, 1.814% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (2, 2, 4, 128, 2048) %'846.1342'[i40_0,T_i17_0_854_0,2T_i1_0_0_1791+T_i1_0_1_1791,i0.128,i1.2048] = load bfloat16<128 x 2048> non_local bfloat16 (4194304,) %'all_reduce.1-buffer-1828'[2097152i40_0+4096i0.128+2048T_i17_0_854_0+i1.2048+1048576T_i1_0_0_1791+524288T_i1_0_1_1791] # id=1190, src_id=None, , instances=16 # dl = tensor_op_name: all_reduce.1_pftranspose_846 | hlo_id: 52 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 30.524us (8.000MiB, est bw: 274.819GB/s, 1.322% of tot. time) for bfloat16<128 x 1024> non_local bfloat16 (4194304,) %'dot.7-buffer-1826'[2048i15_0_0_921_0_0_1176+4096i0.128+1024i15_0_0_921_0_1_1176+i1.1024+2097152i16_0_0_921_1176+524288i16_0_1_921_1176] = store bfloat16<128 x 1024> TongaSB partitions[4] bfloat16 (2, 2, 2, 4, 128, 1024) %922[i15_0_0_921_0_0_1176,i15_0_0_921_0_1_1176,i16_0_0_921_1176,i16_0_1_921_1176,i0.128,i1.1024] # id=1089, src_id=None, , instances=32 # dl = tensor_op_name: _dot.6 | hlo_id: 49 | [[i0.128];[i1.1024]] -> [[i0.128];[i1.1024]] +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 30.524us (8.000MiB, est bw: 274.819GB/s, 1.322% of tot. time) for bfloat16<128 x 1024> non_local bfloat16 (4194304,) %'dot.11-buffer-1831'[2048i115_0_0_0_1007_0_0_1179+4096i0.128+1024i115_0_0_0_1007_0_1_1179+i1.1024+2097152i116_0_0_1007_1179+524288i116_0_1_1007_1179] = store bfloat16<128 x 1024> TongaSB partitions[4] bfloat16 (2, 2, 2, 4, 128, 1024) %1008[i115_0_0_0_1007_0_0_1179,i115_0_0_0_1007_0_1_1179,i116_0_0_1007_1179,i116_0_1_1007_1179,i0.128,i1.1024] # id=1159, src_id=None, , instances=32 # dl = tensor_op_name: _dot.10 | hlo_id: 165 | [[i0.128];[i1.1024]] -> [[i0.128];[i1.1024]] +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 25.532us (8.000MiB, est bw: 328.547GB/s, 1.106% of tot. time) for bfloat16<128 x 2048> {'IntermediateTensor': ''}bfloat16 (1, 2, 4, 128, 4096) %'intermediate6'(init=0.0)[0,i40_0,2T_i18_1_0_854_0+T_i18_1_0_854_1,i0.128,2048T_i17_0_854_0+i1.2048] = store bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (2, 2, 4, 128, 2048) %'850.1397'[i40_0,T_i17_0_854_0,2T_i18_1_0_854_0+T_i18_1_0_854_1,i0.128,i1.2048] # id=1194, src_id=None, , instances=16 # dl = tensor_op_name: intermediate6_pftranspose_850 | hlo_id: 2 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49126 [sg0002/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.016 seconds +2025-08-07T13:53:59Z INFO 49126 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.004 seconds +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/OptimizeNKIKernels]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.002 seconds +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.027 seconds +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.008 seconds +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.043 seconds +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.013 seconds +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.004 seconds +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.002 seconds +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49126 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.020 seconds +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49126 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.120 seconds +2025-08-07T13:53:59Z INFO 49126 [sg0002/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.003 seconds +2025-08-07T13:53:59Z INFO 49126 [sg0002/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-08-07T13:53:59Z INFO 49126 [sg0002/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.011 seconds +2025-08-07T13:53:59Z INFO 49126 [sg0002/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.004 seconds +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.017 seconds +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.002 seconds +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49125 [sg0001/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.044 seconds +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/SpillPSum]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/SpillPSum]: SpillPSum finished after 0.022 seconds +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:53:59Z INFO 49125 [Tensorizer]: BirCodeGen estimate #instances=20003 in sg0001 +2025-08-07T13:53:59Z INFO 49125 [Tensorizer]: IR signature: fa0435b1d147525f8e0db1c0594a5b376a85965783d94d09e0944cb7850cde48 for nc00/sg0001/TensorizerBIR +2025-08-07T13:53:59Z INFO 49125 [Tensorizer]: Weights total number of bytes: 196608 +2025-08-07T13:53:59Z INFO 49125 [Tensorizer]: Successfully built model. +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.040 seconds +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.002 seconds +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/LegalizeType]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/LegalizeType]: LegalizeType finished after 0.004 seconds +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.012 seconds +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.054 seconds +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/WeightCoalescing]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.003 seconds +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.040 seconds +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.004 seconds +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/TensorInitialization]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.022 seconds +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.017 seconds +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:53:59Z INFO 49124 [sg0000/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.003 seconds +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/SimplifyNeuronTensor]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.015 seconds +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/DataStreaming]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/DataStreaming]: DataStreaming finished after 0.006 seconds +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.277 seconds +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.007 seconds +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.005 seconds +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/SimpleAllReduceTiling]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.002 seconds +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 163.614us (32.000MiB, est bw: 205.083GB/s, 29.598% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 4, 4, 2, 128, 2048) %1536[i47_0_0,i48_0_1535,i32_0_0_1,c2_1255,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (4, 4, 128, 2, 2048) %'input83'[i48_0_1535,i32_0_0_1,i0.128,c2_1255,i1.2048] # id=1415, src_id=None, , instances=64 # dl = tensor_op_name: _dot.2 | hlo_id: 34 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 99.175us (16.000MiB, est bw: 169.167GB/s, 17.941% of tot. time) for bfloat16<128 x 1024> TongaSB partitions[4] bfloat16 (2, 2, 2, 4, 128, 4, 512) %'input77_local_1302'[i122_0_0_0_1308_0_0_1537,i122_0_0_0_1308_0_1_1537,i122_0_0_0_1,c1_1295_2205,i0.128,i3.4,i1.128+128i2.2+256p_1838_2205] = load bfloat16<128 x 1024> {'CrossPassTensor': ''}bfloat16 (8, 2, 128, 4, 4, 2, 128) %'input77'[4i122_0_0_0_1308_0_0_1537+2i122_0_0_0_1308_0_1_1537+i122_0_0_0_1,p_1838_2205,i0.128,c1_1295_2205,i3.4,i2.2,i1.128] # id=1520, src_id=None, , instances=64 # dl = tensor_op_name: _dot.3 | hlo_id: 145 | [[i0.128];[i1.128, i2.2, i3.4]] -> [[i0.128];[i1.128, i2.2, i3.4]] +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 50.238us (8.000MiB, est bw: 166.979GB/s, 9.088% of tot. time) for bfloat16<128 x 1024> TongaSB partitions[3] bfloat16 (2, 2, 8, 128, 2, 512) %'intermediate1_pftranspose_1180'[T_i1_1_0_1184,T_i1_0_1184,i1_1_1_0_2202,i0.128,i2.2,i1.512] = load bfloat16<128 x 1024> DRAM2DBlk partitions[1] bfloat16 (2, 1, 2, 8, 128, 1024) %'all_gather.1'[T_i1_1_0_1184,0,T_i1_0_1184,i1_1_1_0_2202,i0.128,i1.512+512i2.2] # id=1374, src_id=None, , instances=32 # dl = tensor_op_name: UnnamedModule | hlo_id: 1 | [[i0.128];[i1.512, i2.2]] -> [[i0.128];[i1.512, i2.2]] +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 41.879us (8.000MiB, est bw: 200.308GB/s, 7.576% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (2, 8, 128, 2, 1024) %'all_gather.1_local_1239'[i29_0_1_0_1243,i29_0_1_1_1243,i0.128,i2.2,i1.1024] = load bfloat16<128 x 2048> DRAM2DBlk partitions[1] bfloat16 (2, 1, 2, 8, 128, 1024) %'all_gather.1'[i29_0_1_0_1243,0,i2.2,i29_0_1_1_1243,i0.128,i1.1024] # id=1410, src_id=None, , instances=16 # dl = tensor_op_name: _custom-call.226 | hlo_id: 27 | [[i0.128];[i1.1024, i2.2]] -> [[i0.128];[i1.1024, i2.2]] +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 30.524us (8.000MiB, est bw: 274.819GB/s, 5.522% of tot. time) for bfloat16<128 x 1024> {'IntermediateTensor': ''}bfloat16 (2, 4, 128, 2, 2, 1024) %'intermediate1'(init=0.0)[T_i0_0_1184,T_i0_1_1184_0,i0.128,T_i1_0_1184,T_i1_1_0_1184,i1.1024] = store bfloat16<128 x 1024> TongaSB partitions[4] bfloat16 (2, 2, 2, 4, 128, 1024) %'1180.1850'[T_i1_1_0_1184,T_i1_0_1184,T_i0_0_1184,T_i0_1_1184_0,i0.128,i1.1024] # id=1553, src_id=None, , instances=32 # dl = tensor_op_name: intermediate1_pftranspose_1180 | hlo_id: 1 | [[i0.128];[i1.1024]] -> [[i0.128];[i1.1024]] +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 30.524us (8.000MiB, est bw: 274.819GB/s, 5.522% of tot. time) for bfloat16<128 x 1024> non_local bfloat16 (4194304,) %'dot.4-buffer-2238'[2048i122_0_0_0_1308_0_0_1537+4096i0.128+1024i122_0_0_0_1308_0_1_1537+i1.1024+2097152i123_0_0_1308_1537+524288i123_0_1_1308_1537] = store bfloat16<128 x 1024> TongaSB partitions[4] bfloat16 (2, 2, 2, 4, 128, 1024) %1309[i122_0_0_0_1308_0_0_1537,i122_0_0_0_1308_0_1_1537,i123_0_0_1308_1537,i123_0_1_1308_1537,i0.128,i1.1024] # id=1523, src_id=None, , instances=32 # dl = tensor_op_name: _dot.3 | hlo_id: 145 | [[i0.128];[i1.1024]] -> [[i0.128];[i1.1024]] +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 25.769us (4.000MiB, est bw: 162.767GB/s, 4.662% of tot. time) for bfloat16<128 x 1024> TongaSB partitions[2] bfloat16 (2, 8, 128, 1024) %'transpose.1_pftranspose_1175'[T_i12_0_1179,i13_0,i0.128,i1.1024] = indirect_load bfloat16<128 x 1024> {'CrossPassTensor': ''}bfloat16 (151936, 2, 1024) %'input76'[i0.128,T_i12_0_1179,i1.1024] generic generic_dims:[0] generic_addrs: int32<128 x 1> TongaSB partitions[0] int32 (128, 8, 1) %'input0_local_1215'[i0.128,i13_0,0] # id=1371, src_id=None, , attrs={'mode': OOBMode.ERROR}, instances=16 # dl = tensor_op_name: _gather.41 | hlo_id: 16 | [[i0.128];[i1.1024]] -> [[i0.128];[i1.1024]] +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 21.589us (4.000MiB, est bw: 194.277GB/s, 3.906% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (4, 2, 128, 2048) %'input81_local_1276'[i120_0_2206,c1_1270_2203_2206,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (4, 128, 2, 2048) %'input81'[i120_0_2206,i0.128,c1_1270_2203_2206,i1.2048] # id=1460, src_id=None, , instances=8 # dl = tensor_op_name: _dot.1 | hlo_id: 82 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 21.589us (4.000MiB, est bw: 194.277GB/s, 3.906% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (4, 2, 128, 2048) %'input78_local_1291'[i120_0_2207,c1_1285_2204_2207,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (4, 128, 2, 2048) %'input78'[i120_0_2207,i0.128,c1_1285_2204_2207,i1.2048] # id=1514, src_id=None, , instances=8 # dl = tensor_op_name: _dot | hlo_id: 131 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 15.912us (4.000MiB, est bw: 263.593GB/s, 2.879% of tot. time) for bfloat16<128 x 1024> DRAM2DBlk partitions[1] bfloat16 (2, 1, 2, 4, 128, 8, 128) %'transpose.1'[T_i12_0_1179,0,T_i12_1_0_1179,T_i12_1_1_1179_0,i0.128,i2.4+4i3.2,i1.128] = store bfloat16<128 x 1024> TongaSB partitions[3] bfloat16 (2, 2, 4, 128, 2, 512) %'1175.1848'[T_i12_0_1179,T_i12_1_0_1179,T_i12_1_1_1179_0,i0.128,i3.2,i1.128+128i2.4] # id=1540, src_id=None, , instances=16 # dl = tensor_op_name: transpose.1_pftranspose_1175 | hlo_id: 16 | [[i0.128];[i1.128, i2.4, i3.2]] -> [[i0.128];[i1.128, i2.4, i3.2]] +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.005 seconds +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/OptimizeNKIKernels]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.002 seconds +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.030 seconds +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.005 seconds +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.041 seconds +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.003 seconds +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.006 seconds +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-08-07T13:54:00Z INFO 49126 [sg0002/Tensorizer/SimplifyNeuronTensor]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-08-07T13:54:00Z INFO 49126 [sg0002/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 1.056 seconds +2025-08-07T13:54:00Z INFO 49126 [sg0002/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:54:00Z INFO 49126 [sg0002/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49124 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.063 seconds +2025-08-07T13:54:00Z INFO 49126 [sg0002/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.005 seconds +2025-08-07T13:54:00Z INFO 49126 [sg0002/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:54:00Z INFO 49126 [sg0002/Tensorizer/DataStreaming]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 49126 [sg0002/Tensorizer/DataStreaming]: DataStreaming finished after 0.037 seconds +2025-08-07T13:54:00Z INFO 49126 [sg0002/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:54:00Z INFO 49124 [Tensorizer]: BirCodeGen estimate #instances=12096 in sg0000 +2025-08-07T13:54:00Z INFO 49124 [Tensorizer]: IR signature: 2cf1e920f2ce14b5e1349e8b1c65714884a444126d124dbda7ee4f3cff30972e for nc00/sg0000/TensorizerBIR +2025-08-07T13:54:00Z INFO 49124 [Tensorizer]: Weights total number of bytes: 196864 +2025-08-07T13:54:00Z INFO 49124 [Tensorizer]: Successfully built model. +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.275 seconds +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.014 seconds +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.014 seconds +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/SimpleAllReduceTiling]: Finished (changed=True) +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.009 seconds +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 3.014ms (594.000MiB, est bw: 206.636GB/s, 57.816% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (594, 2, 128, 2048) %'700.1100'[i31_0,T_i1_0_2805,i0.128,i1.128+128i2.16] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (75968, 32, 128) %'input473'[128i31_0+i0.128,16T_i1_0_2805+i2.16,i1.128] # id=1099, src_id=None, , instances=1188 # dl = tensor_op_name: input473_pftranspose_700 | hlo_id: 90 | if -128i31_0-i0.128+75967 >= 0 [[i0.128];[i1.128, i2.16]] -> [[i0.128];[i1.128, i2.16]] +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 1.041ms (48.000MiB, est bw: 48.348GB/s, 19.968% of tot. time) for bfloat16<128 x 128> TongaSB partitions[5] bfloat16 (2, 2, 2, 2, 24, 128, 512) %'input469_local_773'[i15_0_0_779_0_0_1072,i15_0_0_779_0_1_1072,i15_0_0_1,c1_767,c2_768,i0.128,i1.128+128p_2181] = load bfloat16<128 x 128> {'CrossPassTensor': ''}bfloat16 (8, 4, 2, 128, 24, 128) %'input469'[4i15_0_0_779_0_0_1072+2i15_0_0_779_0_1_1072+i15_0_0_1,p_2181,c1_767,i0.128,c2_768,i1.128] # id=951, src_id=None, , instances=1536 # dl = tensor_op_name: _dot.256 | hlo_id: 59 | [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 488.243us (96.000MiB, est bw: 206.175GB/s, 9.365% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 24, 2, 128, 2048) %1073[i11_0,i10_0_0,i10_0_1,c2_748,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 24, 128, 4096) %'input470'[i10_0_0,i10_0_1,i0.128,i1.2048+2048c2_748] # id=942, src_id=None, , instances=192 # dl = tensor_op_name: _dot.254 | hlo_id: 49 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 244.771us (48.000MiB, est bw: 205.627GB/s, 4.695% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (2, 24, 2, 128, 2048) %'input472_local_763'[i12_0_0,4i12_0_1_0+i12_0_1_1,c2_758,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 24, 128, 4096) %'input472'[i12_0_0,4i12_0_1_0+i12_0_1_1,i0.128,i1.2048+2048c2_758] # id=945, src_id=None, , instances=96 # dl = tensor_op_name: _dot.255 | hlo_id: 40 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 191.807us (297.000KiB, est bw: 1.586GB/s, 3.679% of tot. time) for float32<1 x 128> {'no_delinear': '0'}non_local float32 (1, 75968) %'convert.59'[0,128i31_0+i0.128] = store float32<1 x 128> TongaSB partitions[1] float32 (594, 1, 128) %'dot.257.1110'[i31_0,0,i0.128] # id=1108, src_id=None, , instances=594 # dl = tensor_op_name: _dot.257 | hlo_id: 90 | if -128i31_0-i0.128+75967 >= 0 [[];[i0.128]] -> [[];[i0.128]] +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 41.879us (8.000MiB, est bw: 200.308GB/s, 0.803% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (2, 4, 2, 128, 2048) %'704.2160'[i11_0,T_i1_0,T_i2_0_2803,i0.128,i1.2048] = load bfloat16<128 x 2048> non_local bfloat16 (2, 4, 128, 4096) %'add.9'[i11_0,T_i1_0,i0.128,i1.2048+2048T_i2_0_2803] # id=1074, src_id=None, , instances=16 # dl = tensor_op_name: add.9_pftranspose_704 | hlo_id: 27 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 41.879us (8.000MiB, est bw: 200.308GB/s, 0.803% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (2, 4, 2, 128, 2048) %'708.2165'[T_i20_0_716,T_i1_0,T_i2_0_2804,i0.128,i1.2048] = load bfloat16<128 x 2048> non_local bfloat16 (4194304,) %'all_reduce.3-buffer-2825'[2097152T_i20_0_716+4096i0.128+524288T_i1_0+i1.2048+2048T_i2_0_2804] # id=1083, src_id=None, , instances=16 # dl = tensor_op_name: all_reduce.3_pftranspose_708 | hlo_id: 62 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 30.524us (8.000MiB, est bw: 274.819GB/s, 0.585% of tot. time) for bfloat16<128 x 1024> non_local bfloat16 (4194304,) %'dot.14-buffer-2823'[2048i15_0_0_779_0_0_1072+4096i0.128+1024i15_0_0_779_0_1_1072+i1.1024+2097152i16_0_0_779_1072+524288i16_0_1_779_1072] = store bfloat16<128 x 1024> TongaSB partitions[4] bfloat16 (2, 2, 2, 4, 128, 1024) %780[i15_0_0_779_0_0_1072,i15_0_0_779_0_1_1072,i16_0_0_779_1072,i16_0_1_779_1072,i0.128,i1.1024] # id=954, src_id=None, , instances=32 # dl = tensor_op_name: _dot.256 | hlo_id: 59 | [[i0.128];[i1.1024]] -> [[i0.128];[i1.1024]] +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 25.532us (8.000MiB, est bw: 328.547GB/s, 0.490% of tot. time) for bfloat16<128 x 2048> non_local bfloat16 (1024, 32, 128) %'convert.57'[512T_i20_0_716+i0.128+128T_i20_1_716_0,16T_i19_0_716_0_1170+i2.4+4i3.4,i1.128] = store bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (2, 2, 4, 128, 4, 512) %'712.2569'[T_i20_0_716,T_i19_0_716_0_1170,T_i20_1_716_0,i0.128,i3.4,i1.128+128i2.4] # id=1087, src_id=None, , instances=16 # dl = tensor_op_name: convert.57_pftranspose_712 | hlo_id: 70 | [[i0.128];[i1.128, i2.4, i3.4]] -> [[i0.128];[i1.128, i2.4, i3.4]] +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 22.647us (296.758KiB, est bw: 13.418GB/s, 0.434% of tot. time) for float32<1 x 15194> TongaSB partitions[1] float32 (5, 1, 15194) %'custom-call.411.1179'[i1,0,i0.15194] = load float32<1 x 15194> {'no_delinear': '0'}non_local float32 (1, 75968) %'convert.59'[15194i1+i0.15194] # id=1174, src_id=None, , instances=5 # dl = tensor_op_name: _custom-call.411 | hlo_id: 93 | if -15194i1-i0.15194+75967 >= 0 [[];[i0.15194]] -> [[];[i0.15194]] +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.012 seconds +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.003 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (1, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (1, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.001 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.003 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (1, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (1, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/OptimizeNKIKernels]: Finished (changed=True) +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.388 seconds +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.033 seconds +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.014 seconds +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.113 seconds +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.015 seconds +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.087 seconds +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-08-07T13:54:01Z INFO 49126 [sg0002/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-08-07T13:54:02Z INFO 49126 [sg0002/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-08-07T13:54:02Z INFO 49126 [sg0002/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.477 seconds +2025-08-07T13:54:02Z INFO 49126 [Tensorizer]: BirCodeGen estimate #instances=106646 in sg0002 +2025-08-07T13:54:02Z INFO 49126 [Tensorizer]: IR signature: 8d77f77b258269ebb8a4baf6acd121a87992261d39af9caef170466eb257b177 for nc00/sg0002/TensorizerBIR +2025-08-07T13:54:02Z INFO 49126 [Tensorizer]: Weights total number of bytes: 135176 +2025-08-07T13:54:02Z INFO 49126 [Tensorizer]: Successfully built model. +2025-08-07T13:54:02Z USER 48502 [root/Tensorizer/Tensorizer]: Tensorizer finished after 10.055 seconds +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: End tensorization +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input76 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input0 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input79 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input83 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input82 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input1 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input81 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input80 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input78 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input77 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input4 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input2 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input5 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input86 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input87 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input85 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input84 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input90 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input94 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input93 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input92 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input91 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input89 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input88 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input6 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input2 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input7 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input471 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input472 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input470 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input469 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input474 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input1 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input473 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Network input: input3 +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: wrote bir.json +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: wrote tensor_map.json +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: wrote bir.json +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: wrote tensor_map.json +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: wrote bir.json +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: wrote tensor_map.json +2025-08-07T13:54:02Z INFO 48502 [job.Frontend.0]: Job #0 finished +2025-08-07T13:54:02Z INFO 48502 [pipeline.Pipeline.0]: Finished job job.Frontend.0 +2025-08-07T13:54:02Z INFO 48502 [pipeline.Pipeline.0]: Starting job job.StaticIOTranspose.0 +2025-08-07T13:54:02Z INFO 48502 [pipeline.Pipeline.0]: Finished job job.StaticIOTranspose.0 +2025-08-07T13:54:02Z INFO 48502 [pipeline.Pipeline.0]: Starting job job.WalrusDriver.0 +2025-08-07T13:54:02Z INFO 48502 [job.WalrusDriver.0]: BackendDriver has 3 states with 1 core LNC +2025-08-07T13:54:02Z INFO 48502 [job.WalrusDriver.0]: BackendDriver MT cwd: /home/ubuntu/qwen3/context_encoding_model/_tp0_bk3/neuronxcc-dfnjq5y6 +2025-08-07T13:54:02Z INFO 48502 [job.BIRLinker.1]: Creating directory sgLnk/sg00 +2025-08-07T13:54:02Z INFO 48502 [job.WalrusDriver.0]: StateId sg00 Dir /home/ubuntu/qwen3/context_encoding_model/_tp0_bk3/neuronxcc-dfnjq5y6/sg00 +2025-08-07T13:54:02Z INFO 48502 [job.WalrusDriver.0]: StateId sg01 Dir /home/ubuntu/qwen3/context_encoding_model/_tp0_bk3/neuronxcc-dfnjq5y6/sg01 +2025-08-07T13:54:02Z INFO 48502 [job.WalrusDriver.0]: StateId sg02 Dir /home/ubuntu/qwen3/context_encoding_model/_tp0_bk3/neuronxcc-dfnjq5y6/sg02 +2025-08-07T13:54:02Z INFO 48502 [job.WalrusDriver.0]: Number of subgraphs to link: 3 +2025-08-07T13:54:02Z INFO 48502 [job.WalrusDriver.0]: lnkState: {"model": ["/home/ubuntu/qwen3/context_encoding_model/_tp0_bk3/model.MODULE_b3ddbc97e5f0d1d64c82+155de413.hlo_module.pb"], "tensormap": "tensor_map.json", "bir": "bir.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "state_dir": "/home/ubuntu/qwen3/context_encoding_model/_tp0_bk3/neuronxcc-dfnjq5y6/sgLnk/sg00", "state_id": "sgLnk"} +2025-08-07T13:54:02Z INFO 48502 [job.WalrusDriver.0]: BackendDriver in_state.num_states 3 with 1 core LNC +2025-08-07T13:54:02Z INFO 48502 [job.WalrusDriver.0]: Executing /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/walrus_driver --optlevel 2 --allocator coloring --verbose 35 --logfile-verbose 20 --logfile /home/ubuntu/qwen3/context_encoding_model/_tp0_bk3/log-neuron-cc.txt -o walrus_bir.out.json --enable-call-graph --enable-mt-backend --link-subgraphs sg00,sg01,sg02 --link-dir sgLnk/sg00 --execute-repetition 1 -i bir.json --min_split_size 10240 --skip_split_vns '' --no_split_dram --split_huge_dram_tensor 1.0 --preprocessing_only --max_tensorizer_distance 64 --pack_same_shape_only --instruction_fetch_latency 511 --max-partitions 1 --policy 3 --auxflag 0 --interleave none --schedule-delayed-latency 1 --postsched-mm-accum-reorder=false --max-load-lower-bound 0.14 --force-prefetch-follow-incoming-order -1 --allreduce-buffer-size 500 --dram-page-size 512 --dram-rotation-size -1 --allreduce-rotation-dis 8 --repeat-load-thres 4 --enable-mm-transpose-remat-optimization=true --save-len-thres 512 --save-dma-cnt-thres 32 --relaxed-order=true --enable-anti-dependence-reduction=false --num-semaphores-per-queue 16 --numcores 1 --act-root-json /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/pwp/pwp_bin_trainium/act_info.json --dve-root-json /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen2/dve_info.json --enable-verifier=true --enable-birsim=false --enable-birsim-sync-only=false --enable-data-race-checker=false --enable-new-backend=true --inject-error=NONE --enable-internal-partitioner --dge-levels io,vector_dynamic_offsets,scalar_dynamic_offset --dynamic-dma-scratch-size-per-partition=16384 --neff-output-filename /home/ubuntu/qwen3/context_encoding_model/_tp0_bk3/model.MODULE_b3ddbc97e5f0d1d64c82+155de413.neff +2025-08-07T13:54:02Z INFO 48502 [job.WalrusDriver.0]: Working directory is /home/ubuntu/qwen3/context_encoding_model/_tp0_bk3/neuronxcc-dfnjq5y6 +2025-08-07T13:54:02Z INFO 48502 [job.WalrusDriver.0]: propagate_exit=True +2025-08-07T13:54:02Z INFO 48502 [job.WalrusDriver.0]: use_logger=False +2025-08-07T13:54:02Z INFO 48502 [job.WalrusDriver.0]: expose_stderr=True +2025-08-07T13:54:02Z INFO 49414 [Logging]: Logging to ../log-neuron-cc.txt at level 'INFO' +2025-08-07T13:54:02Z INFO 49414 [BackendDriver]: max_allowed_parallelism=128 +2025-08-07T13:54:02Z INFO 49414 [BackendDriver]: Loading module from sg00/bir.json +2025-08-07T13:54:02Z INFO 49414 [BackendDriver]: Loading module from sg01/bir.json +2025-08-07T13:54:02Z INFO 49414 [BackendDriver]: Loading module from sg02/bir.json +2025-08-07T13:54:02Z INFO 49414 [BackendDriver]: Backend driver mtBackend: true numModules: 3 Cwd: "/home/ubuntu/qwen3/context_encoding_model/_tp0_bk3/neuronxcc-dfnjq5y6" +2025-08-07T13:54:02Z INFO 49414 [BackendDriver]: DynamicDMA is enabled +2025-08-07T13:54:02Z INFO 49414 [BackendDriver]: DynamicDMA levels being enabled: io, scalar_dynamic_offset, vector_dynamic_offsets, +2025-08-07T13:54:02Z INFO 49414 [BackendDriver]: Modular flow call graph is enabled +2025-08-07T13:54:02Z INFO 49414 [BackendDriver]: Internal partitioner is enabled +2025-08-07T13:54:02Z USER 49414 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:54:02Z INFO 49414 [BackendPassManager]: Inputs to mod_parallel_pass: modules=3 functions=3 allocs=671 blocks=3 instructions=1038 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:02Z USER 49414 (sg00) [ModuleForkPass]: Running do_nothing +2025-08-07T13:54:02Z USER 49414 (sg02) [ModuleForkPass]: Running do_nothing +2025-08-07T13:54:02Z USER 49414 (sg01) [ModuleForkPass]: Running do_nothing +2025-08-07T13:54:02Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=192 blocks=1 instructions=40 Max writers: 12 Max Readers: 11 +2025-08-07T13:54:02Z USER 49414 (sg00) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 80mb, ru_maxrss: 200mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 192 memory location(s), 1 block(s), and 40 instruction(s). Max writers: 12 Max Readers: 11 +2025-08-07T13:54:02Z USER 49414 (sg00) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:02Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=138 blocks=1 instructions=45 Max writers: 2 Max Readers: 9 +2025-08-07T13:54:02Z USER 49414 (sg01) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=192 blocks=1 instructions=40 Max writers: 12 Max Readers: 11 +2025-08-07T13:54:02Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 80mb, ru_maxrss: 200mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 138 memory location(s), 1 block(s), and 45 instruction(s). Max writers: 2 Max Readers: 9 +2025-08-07T13:54:02Z USER 49414 (sg01) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:02Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=138 blocks=1 instructions=45 Max writers: 2 Max Readers: 9 +2025-08-07T13:54:02Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=341 blocks=1 instructions=953 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:02Z USER 49414 (sg02) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 80mb, ru_maxrss: 200mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 341 memory location(s), 1 block(s), and 953 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:54:02Z USER 49414 (sg02) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:02Z WARNING 49414 [birverifier::InstVisitor]: (sg00) Non - output memory location with no reader: {convert.270.1874}@SB<0,0>(1x2)#Internal DebugInfo: +2025-08-07T13:54:02Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=341 blocks=1 instructions=953 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:02Z USER 49414 (sg00) [ModuleForkPass]: birverifier finished after 0.016 seconds +2025-08-07T13:54:02Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 122mb, ru_maxrss: 200mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 192 memory location(s), 1 block(s), and 40 instruction(s). Max writers: 12 Max Readers: 11 +2025-08-07T13:54:02Z USER 49414 (sg01) [ModuleForkPass]: birverifier finished after 0.049 seconds +2025-08-07T13:54:02Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 208mb, ru_maxrss: 207mb (delta=7mb) +2025-08-07T13:54:02Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 138 memory location(s), 1 block(s), and 45 instruction(s). Max writers: 2 Max Readers: 9 +2025-08-07T13:54:02Z USER 49414 (sg02) [ModuleForkPass]: birverifier finished after 0.178 seconds +2025-08-07T13:54:02Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 341mb, ru_maxrss: 341mb (delta=141mb) +2025-08-07T13:54:02Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 341 memory location(s), 1 block(s), and 953 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:54:02Z USER 49414 [ModuleForkPass]: Compilation status: Total modules: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:02Z USER 49414 [BackendPassManager]: mod_parallel_pass finished after 0.181 seconds +2025-08-07T13:54:02Z INFO 49414 [BackendPassManager]: curr_vmrss: 333mb, ru_maxrss: 341mb (delta=141mb) +2025-08-07T13:54:02Z INFO 49414 [BackendPassManager]: Output has 3 module(s), 3 function(s), 671 memory location(s), 3 block(s), and 1038 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:54:02Z USER 49414 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:54:02Z INFO 49414 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=3 functions=3 allocs=671 blocks=3 instructions=1038 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:02Z USER 49414 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:54:02Z INFO 49414 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=192 blocks=1 instructions=40 Max writers: 12 Max Readers: 11 +2025-08-07T13:54:02Z USER 49414 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49414 (sg00) [SubgraphForkPass]: curr_vmrss: 333mb, ru_maxrss: 341mb (delta=0mb) +2025-08-07T13:54:02Z USER 49414 (sg01) [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:54:02Z USER 49414 (sg02) [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:54:02Z INFO 49414 (sg00) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 192 memory location(s), 1 block(s), and 40 instruction(s). Max writers: 12 Max Readers: 11 +2025-08-07T13:54:02Z INFO 49414 (sg01) [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=138 blocks=1 instructions=45 Max writers: 2 Max Readers: 9 +2025-08-07T13:54:02Z USER 49414 (sg01) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49414 (sg01) [SubgraphForkPass]: curr_vmrss: 333mb, ru_maxrss: 341mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49414 (sg02) [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=341 blocks=1 instructions=953 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:02Z USER 49414 (sg02) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49414 (sg01) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 138 memory location(s), 1 block(s), and 45 instruction(s). Max writers: 2 Max Readers: 9 +2025-08-07T13:54:02Z INFO 49414 (sg02) [SubgraphForkPass]: curr_vmrss: 333mb, ru_maxrss: 341mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49414 (sg02) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 341 memory location(s), 1 block(s), and 953 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:54:02Z USER 49414 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:02Z USER 49414 [BackendPassManager]: subgraph_parallel_pass finished after 0.001 seconds +2025-08-07T13:54:02Z INFO 49414 [BackendPassManager]: curr_vmrss: 333mb, ru_maxrss: 341mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49414 [BackendPassManager]: Output has 3 module(s), 3 function(s), 671 memory location(s), 3 block(s), and 1038 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:54:02Z USER 49414 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:54:02Z INFO 49414 [BackendPassManager]: Inputs to mod_parallel_pass: modules=3 functions=3 allocs=671 blocks=3 instructions=1038 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:02Z USER 49414 (sg00) [ModuleForkPass]: Running expand_replication +2025-08-07T13:54:02Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=192 blocks=1 instructions=40 Max writers: 12 Max Readers: 11 +2025-08-07T13:54:02Z INFO 49414 (sg00) [ExpandReplication]: Found 0 replicated matmults +2025-08-07T13:54:02Z USER 49414 (sg00) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 333mb, ru_maxrss: 341mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 192 memory location(s), 1 block(s), and 40 instruction(s). Max writers: 12 Max Readers: 11 +2025-08-07T13:54:02Z USER 49414 (sg00) [ModuleForkPass]: Running unroll +2025-08-07T13:54:02Z USER 49414 (sg01) [ModuleForkPass]: Running expand_replication +2025-08-07T13:54:02Z USER 49414 (sg02) [ModuleForkPass]: Running expand_replication +2025-08-07T13:54:02Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=192 blocks=1 instructions=40 Max writers: 12 Max Readers: 11 +2025-08-07T13:54:02Z INFO 49414 (sg00) [Unroll]: INFO (Unroll) Start unrolling at Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=138 blocks=1 instructions=45 Max writers: 2 Max Readers: 9 +2025-08-07T13:54:02Z INFO 49414 (sg01) [ExpandReplication]: Found 0 replicated matmults +2025-08-07T13:54:02Z USER 49414 (sg01) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 333mb, ru_maxrss: 341mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=341 blocks=1 instructions=953 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:02Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 138 memory location(s), 1 block(s), and 45 instruction(s). Max writers: 2 Max Readers: 9 +2025-08-07T13:54:02Z USER 49414 (sg01) [ModuleForkPass]: Running unroll +2025-08-07T13:54:02Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=138 blocks=1 instructions=45 Max writers: 2 Max Readers: 9 +2025-08-07T13:54:02Z INFO 49414 (sg02) [ExpandReplication]: Found 0 replicated matmults +2025-08-07T13:54:02Z INFO 49414 (sg01) [Unroll]: INFO (Unroll) Start unrolling at Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:02Z USER 49414 (sg02) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-08-07T13:54:02Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 333mb, ru_maxrss: 341mb (delta=0mb) +2025-08-07T13:54:02Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 341 memory location(s), 1 block(s), and 953 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:54:02Z USER 49414 (sg02) [ModuleForkPass]: Running unroll +2025-08-07T13:54:02Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=341 blocks=1 instructions=953 Max writers: 191 Max Readers: 475 +2025-08-07T13:54:02Z INFO 49414 (sg02) [Unroll]: INFO (Unroll) Start unrolling at Thu Aug 7 13:54:02 2025 +2025-08-07T13:54:03Z INFO 49414 (sg00) [Unroll]: INFO (Unroll) DONE unrolling Thu Aug 7 13:54:02 2025 + +2025-08-07T13:54:03Z INFO 49414 (sg00) [Unroll]: sg0000 Instruction count after Unroll: +2025-08-07T13:54:03Z INFO 49414 (sg00) [Unroll]: Total count: 8384 +2025-08-07T13:54:03Z INFO 49414 (sg00) [Unroll]: Matmult: 4368 +2025-08-07T13:54:03Z INFO 49414 (sg00) [Unroll]: TensorScalarPtr: 986 +2025-08-07T13:54:03Z INFO 49414 (sg00) [Unroll]: GenericCopy: 705 +2025-08-07T13:54:03Z INFO 49414 (sg00) [Unroll]: TensorTensor: 644 +2025-08-07T13:54:03Z INFO 49414 (sg00) [Unroll]: TensorReduce: 448 +2025-08-07T13:54:03Z INFO 49414 (sg00) [Unroll]: Activation: 422 +2025-08-07T13:54:03Z INFO 49414 (sg00) [Unroll]: Memset: 202 +2025-08-07T13:54:03Z INFO 49414 (sg00) [Unroll]: Load: 199 +2025-08-07T13:54:03Z INFO 49414 (sg00) [Unroll]: TensorScalarAffineSelect: 192 +2025-08-07T13:54:03Z INFO 49414 (sg00) [Unroll]: Save: 85 +2025-08-07T13:54:03Z INFO 49414 (sg00) [Unroll]: DMACopy: 82 +2025-08-07T13:54:03Z INFO 49414 (sg00) [Unroll]: Reciprocal: 32 +2025-08-07T13:54:03Z INFO 49414 (sg00) [Unroll]: Iota: 16 +2025-08-07T13:54:03Z INFO 49414 (sg00) [Unroll]: CollectiveCompute: 3 +2025-08-07T13:54:03Z INFO 49414 (sg00) [Unroll]: Unrolled DGE count with Dynamic AP: 80 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: unroll finished after 0.100 seconds +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 482mb, ru_maxrss: 482mb (delta=141mb) +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4354 memory location(s), 1 block(s), and 8384 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z INFO 49414 (sg01) [Unroll]: INFO (Unroll) DONE unrolling Thu Aug 7 13:54:02 2025 + +2025-08-07T13:54:03Z INFO 49414 (sg01) [Unroll]: sg0001 Instruction count after Unroll: +2025-08-07T13:54:03Z INFO 49414 (sg01) [Unroll]: Total count: 20003 +2025-08-07T13:54:03Z INFO 49414 (sg01) [Unroll]: Matmult: 14360 +2025-08-07T13:54:03Z INFO 49414 (sg01) [Unroll]: Load: 2012 +2025-08-07T13:54:03Z INFO 49414 (sg01) [Unroll]: GenericCopy: 814 +2025-08-07T13:54:03Z INFO 49414 (sg01) [Unroll]: TensorScalarPtr: 780 +2025-08-07T13:54:03Z INFO 49414 (sg01) [Unroll]: TensorReduce: 576 +2025-08-07T13:54:03Z INFO 49414 (sg01) [Unroll]: Activation: 564 +2025-08-07T13:54:03Z INFO 49414 (sg01) [Unroll]: TensorTensor: 448 +2025-08-07T13:54:03Z INFO 49414 (sg01) [Unroll]: Select: 256 +2025-08-07T13:54:03Z INFO 49414 (sg01) [Unroll]: Save: 81 +2025-08-07T13:54:03Z INFO 49414 (sg01) [Unroll]: DMACopy: 66 +2025-08-07T13:54:03Z INFO 49414 (sg01) [Unroll]: Reciprocal: 32 +2025-08-07T13:54:03Z INFO 49414 (sg01) [Unroll]: Memset: 12 +2025-08-07T13:54:03Z INFO 49414 (sg01) [Unroll]: CollectiveCompute: 2 +2025-08-07T13:54:03Z INFO 49414 (sg01) [Unroll]: Unrolled DGE count with Dynamic AP: 64 +2025-08-07T13:54:03Z USER 49414 (sg01) [ModuleForkPass]: unroll finished after 0.240 seconds +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 591mb, ru_maxrss: 591mb (delta=250mb) +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4712 memory location(s), 1 block(s), and 20003 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:03Z INFO 49414 (sg02) [Unroll]: INFO (Unroll) DONE unrolling Thu Aug 7 13:54:02 2025 + +2025-08-07T13:54:03Z INFO 49414 (sg02) [Unroll]: sg0002 Instruction count after Unroll: +2025-08-07T13:54:03Z INFO 49414 (sg02) [Unroll]: Total count: 60307 +2025-08-07T13:54:03Z INFO 49414 (sg02) [Unroll]: Matmult: 48723 +2025-08-07T13:54:03Z INFO 49414 (sg02) [Unroll]: GenericCopy: 6298 +2025-08-07T13:54:03Z INFO 49414 (sg02) [Unroll]: Load: 3075 +2025-08-07T13:54:03Z INFO 49414 (sg02) [Unroll]: Save: 657 +2025-08-07T13:54:03Z INFO 49414 (sg02) [Unroll]: TensorTensor: 297 +2025-08-07T13:54:03Z INFO 49414 (sg02) [Unroll]: TensorScalarPtr: 279 +2025-08-07T13:54:03Z INFO 49414 (sg02) [Unroll]: Activation: 231 +2025-08-07T13:54:03Z INFO 49414 (sg02) [Unroll]: Max: 224 +2025-08-07T13:54:03Z INFO 49414 (sg02) [Unroll]: MaxIndex: 224 +2025-08-07T13:54:03Z INFO 49414 (sg02) [Unroll]: MatchReplace: 217 +2025-08-07T13:54:03Z INFO 49414 (sg02) [Unroll]: Gather: 35 +2025-08-07T13:54:03Z INFO 49414 (sg02) [Unroll]: Memset: 16 +2025-08-07T13:54:03Z INFO 49414 (sg02) [Unroll]: TensorReduce: 12 +2025-08-07T13:54:03Z INFO 49414 (sg02) [Unroll]: StreamShuffle: 4 +2025-08-07T13:54:03Z INFO 49414 (sg02) [Unroll]: Select: 4 +2025-08-07T13:54:03Z INFO 49414 (sg02) [Unroll]: Iota: 3 +2025-08-07T13:54:03Z INFO 49414 (sg02) [Unroll]: CollectiveCompute: 3 +2025-08-07T13:54:03Z INFO 49414 (sg02) [Unroll]: Reciprocal: 3 +2025-08-07T13:54:03Z INFO 49414 (sg02) [Unroll]: DMACopy: 2 +2025-08-07T13:54:03Z INFO 49414 (sg02) [Unroll]: Unrolled DGE count with Dynamic AP: 1 +2025-08-07T13:54:03Z USER 49414 (sg02) [ModuleForkPass]: unroll finished after 0.629 seconds +2025-08-07T13:54:03Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 739mb, ru_maxrss: 739mb (delta=398mb) +2025-08-07T13:54:03Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11836 memory location(s), 1 block(s), and 60307 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:03Z USER 49414 [ModuleForkPass]: Compilation status: Total modules: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:03Z USER 49414 [BackendPassManager]: mod_parallel_pass finished after 0.645 seconds +2025-08-07T13:54:03Z INFO 49414 [BackendPassManager]: curr_vmrss: 554mb, ru_maxrss: 739mb (delta=398mb) +2025-08-07T13:54:03Z INFO 49414 [BackendPassManager]: Output has 3 module(s), 3 function(s), 20902 memory location(s), 3 block(s), and 88694 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:03Z USER 49414 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:54:03Z INFO 49414 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=3 functions=3 allocs=20902 blocks=3 instructions=88694 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:03Z USER 49414 (sg00) [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:54:03Z USER 49414 (sg01) [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:54:03Z USER 49414 (sg02) [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:54:03Z INFO 49414 (sg00) [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=4354 blocks=1 instructions=8384 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z INFO 49414 (sg01) [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=4712 blocks=1 instructions=20003 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:03Z INFO 49414 (sg02) [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=11836 blocks=1 instructions=60307 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:03Z INFO 49414 (sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:03Z INFO 49414 (sg00) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49414 (sg00) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49414 (sg00) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z USER 49414 (sg00) [SubgraphForkPass]: dead_code_elim finished after 0.013 seconds +2025-08-07T13:54:03Z INFO 49414 (sg00) [SubgraphForkPass]: curr_vmrss: 557mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg00) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 3808 memory location(s), 1 block(s), and 8319 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z INFO 49414 (sg01) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:03Z INFO 49414 (sg01) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49414 (sg01) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49414 (sg01) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z USER 49414 (sg01) [SubgraphForkPass]: dead_code_elim finished after 0.051 seconds +2025-08-07T13:54:03Z INFO 49414 (sg01) [SubgraphForkPass]: curr_vmrss: 580mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg01) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 4625 memory location(s), 1 block(s), and 20003 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:03Z INFO 49414 (sg02) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:03Z INFO 49414 (sg02) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49414 (sg02) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49414 (sg02) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z USER 49414 (sg02) [SubgraphForkPass]: dead_code_elim finished after 0.108 seconds +2025-08-07T13:54:03Z INFO 49414 (sg02) [SubgraphForkPass]: curr_vmrss: 581mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg02) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 11794 memory location(s), 1 block(s), and 60307 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:03Z USER 49414 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:03Z USER 49414 [BackendPassManager]: subgraph_parallel_pass finished after 0.111 seconds +2025-08-07T13:54:03Z INFO 49414 [BackendPassManager]: curr_vmrss: 581mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 [BackendPassManager]: Output has 3 module(s), 3 function(s), 20227 memory location(s), 3 block(s), and 88629 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:03Z USER 49414 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:54:03Z INFO 49414 [BackendPassManager]: Inputs to mod_parallel_pass: modules=3 functions=3 allocs=20227 blocks=3 instructions=88629 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:03Z USER 49414 (sg01) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:03Z USER 49414 (sg02) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=3808 blocks=1 instructions=8319 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=4625 blocks=1 instructions=20003 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:03Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=11794 blocks=1 instructions=60307 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: birverifier finished after 0.009 seconds +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 581mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3808 memory location(s), 1 block(s), and 8319 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z USER 49414 (sg01) [ModuleForkPass]: birverifier finished after 0.020 seconds +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 581mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4625 memory location(s), 1 block(s), and 20003 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:03Z USER 49414 (sg02) [ModuleForkPass]: birverifier finished after 0.062 seconds +2025-08-07T13:54:03Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 582mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11794 memory location(s), 1 block(s), and 60307 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:03Z USER 49414 [ModuleForkPass]: Compilation status: Total modules: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:03Z USER 49414 [BackendPassManager]: mod_parallel_pass finished after 0.065 seconds +2025-08-07T13:54:03Z INFO 49414 [BackendPassManager]: curr_vmrss: 582mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 [BackendPassManager]: Output has 3 module(s), 3 function(s), 20227 memory location(s), 3 block(s), and 88629 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:03Z USER 49414 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:54:03Z INFO 49414 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=3 functions=3 allocs=20227 blocks=3 instructions=88629 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:03Z USER 49414 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:54:03Z INFO 49414 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=3808 blocks=1 instructions=8319 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z USER 49414 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-08-07T13:54:03Z USER 49414 (sg01) [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:54:03Z INFO 49414 (sg00) [SubgraphForkPass]: curr_vmrss: 582mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z USER 49414 (sg02) [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:54:03Z INFO 49414 (sg00) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 3808 memory location(s), 1 block(s), and 8319 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z INFO 49414 (sg01) [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=4625 blocks=1 instructions=20003 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:03Z USER 49414 (sg01) [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-08-07T13:54:03Z INFO 49414 (sg01) [SubgraphForkPass]: curr_vmrss: 582mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg01) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 4625 memory location(s), 1 block(s), and 20003 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:03Z INFO 49414 (sg02) [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=11794 blocks=1 instructions=60307 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:03Z USER 49414 (sg02) [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-08-07T13:54:03Z INFO 49414 (sg02) [SubgraphForkPass]: curr_vmrss: 582mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg02) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 11794 memory location(s), 1 block(s), and 60307 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:03Z USER 49414 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:03Z USER 49414 [BackendPassManager]: subgraph_parallel_pass finished after 0.004 seconds +2025-08-07T13:54:03Z INFO 49414 [BackendPassManager]: curr_vmrss: 582mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 [BackendPassManager]: Output has 3 module(s), 3 function(s), 20227 memory location(s), 3 block(s), and 88629 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:03Z USER 49414 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:54:03Z INFO 49414 [BackendPassManager]: Inputs to mod_parallel_pass: modules=3 functions=3 allocs=20227 blocks=3 instructions=88629 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: Running instruction_reorder +2025-08-07T13:54:03Z USER 49414 (sg01) [ModuleForkPass]: Running instruction_reorder +2025-08-07T13:54:03Z USER 49414 (sg02) [ModuleForkPass]: Running instruction_reorder +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=3808 blocks=1 instructions=8319 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=4625 blocks=1 instructions=20003 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:03Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=11794 blocks=1 instructions=60307 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: instruction_reorder finished after 0.002 seconds +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 582mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3808 memory location(s), 1 block(s), and 8319 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: Running psum_legalization +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=3808 blocks=1 instructions=8319 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: psum_legalization finished after 0.001 seconds +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 582mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3808 memory location(s), 1 block(s), and 8319 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: Running legalize_cce_dma +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=3808 blocks=1 instructions=8319 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: legalize_cce_dma finished after 0.001 seconds +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 582mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3808 memory location(s), 1 block(s), and 8319 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: Running pre_opts +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=3808 blocks=1 instructions=8319 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z INFO 49414 (sg00) [PreOpts]: Skipped. No pre-opt passes enabled +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 582mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3808 memory location(s), 1 block(s), and 8319 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: Running error_injector +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=3808 blocks=1 instructions=8319 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z WARNING 49414 (sg00) [ErrorInjector]: Unrecognized injected error value "0" +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 582mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3808 memory location(s), 1 block(s), and 8319 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: Running vn_splitter +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=3808 blocks=1 instructions=8319 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z INFO 49414 (sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 0 +2025-08-07T13:54:03Z INFO 49414 (sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-08-07T13:54:03Z USER 49414 (sg01) [ModuleForkPass]: instruction_reorder finished after 0.006 seconds +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 582mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4625 memory location(s), 1 block(s), and 20003 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:03Z USER 49414 (sg01) [ModuleForkPass]: Running psum_legalization +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=4625 blocks=1 instructions=20003 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:03Z USER 49414 (sg01) [ModuleForkPass]: psum_legalization finished after 0.004 seconds +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 582mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4625 memory location(s), 1 block(s), and 20003 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:03Z USER 49414 (sg01) [ModuleForkPass]: Running legalize_cce_dma +2025-08-07T13:54:03Z INFO 49414 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-08-07T13:54:03Z INFO 49414 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-08-07T13:54:03Z INFO 49414 (sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-08-07T13:54:03Z INFO 49414 (sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.003 seconds +2025-08-07T13:54:03Z INFO 49414 (sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.001 seconds +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: vn_splitter finished after 0.007 seconds +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 582mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=4625 blocks=1 instructions=20003 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3808 memory location(s), 1 block(s), and 8319 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: Running constant_propagate +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=3808 blocks=1 instructions=8319 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z INFO 49414 (sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-08-07T13:54:03Z USER 49414 (sg01) [ModuleForkPass]: legalize_cce_dma finished after 0.003 seconds +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 582mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4625 memory location(s), 1 block(s), and 20003 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:03Z USER 49414 (sg01) [ModuleForkPass]: Running pre_opts +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=4625 blocks=1 instructions=20003 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:03Z INFO 49414 (sg01) [PreOpts]: Skipped. No pre-opt passes enabled +2025-08-07T13:54:03Z USER 49414 (sg01) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 582mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4625 memory location(s), 1 block(s), and 20003 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:03Z USER 49414 (sg01) [ModuleForkPass]: Running error_injector +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=4625 blocks=1 instructions=20003 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:03Z WARNING 49414 (sg01) [ErrorInjector]: Unrecognized injected error value "0" +2025-08-07T13:54:03Z USER 49414 (sg01) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 582mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4625 memory location(s), 1 block(s), and 20003 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:03Z USER 49414 (sg01) [ModuleForkPass]: Running vn_splitter +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=4625 blocks=1 instructions=20003 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:03Z INFO 49414 (sg01) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 1 +2025-08-07T13:54:03Z INFO 49414 (sg01) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-08-07T13:54:03Z INFO 49414 (sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49414 (sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49414 (sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z USER 49414 (sg02) [ModuleForkPass]: instruction_reorder finished after 0.019 seconds +2025-08-07T13:54:03Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 582mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 64 +2025-08-07T13:54:03Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11794 memory location(s), 1 block(s), and 60307 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:03Z USER 49414 (sg02) [ModuleForkPass]: Running psum_legalization +2025-08-07T13:54:03Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=11794 blocks=1 instructions=60307 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:03Z INFO 49414 (sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:03Z INFO 49414 (sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49414 (sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49414 (sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: constant_propagate finished after 0.015 seconds +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 582mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3744 memory location(s), 1 block(s), and 8255 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: Running lower_ac +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=3744 blocks=1 instructions=8255 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z INFO 49414 (sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: lower_ac finished after 0.001 seconds +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 582mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3744 memory location(s), 1 block(s), and 8255 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: Running input_dma_coalescing +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=3744 blocks=1 instructions=8255 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z INFO 49414 (sg00) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: input_dma_coalescing finished after 0.002 seconds +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 582mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3744 memory location(s), 1 block(s), and 8255 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: Running remat_optimization +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=3744 blocks=1 instructions=8255 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z INFO 49414 (sg00) [RematOpt]: Removed 0 remat instructions +2025-08-07T13:54:03Z USER 49414 (sg02) [ModuleForkPass]: psum_legalization finished after 0.012 seconds +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: remat_optimization finished after 0.004 seconds +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 582mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 582mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3744 memory location(s), 1 block(s), and 8255 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: Running early_peephole_opts +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to early_peephole_opts: modules=1 functions=1 allocs=3744 blocks=1 instructions=8255 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z INFO 49414 (sg00) [EarlyPeepholeOpts]: PeepholeOpts enabled? ActivationAccumulate: true +2025-08-07T13:54:03Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11794 memory location(s), 1 block(s), and 60307 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:03Z USER 49414 (sg02) [ModuleForkPass]: Running legalize_cce_dma +2025-08-07T13:54:03Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=11794 blocks=1 instructions=60307 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:03Z INFO 49414 (sg00) [EarlyPeepholeOpts]: Activation Accumulate: 192 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: early_peephole_opts finished after 0.003 seconds +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 582mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3744 memory location(s), 1 block(s), and 8255 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=3744 blocks=1 instructions=8255 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.000 seconds +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 582mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3744 memory location(s), 1 block(s), and 8255 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: Running infer_stream_ids +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=3744 blocks=1 instructions=8255 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: infer_stream_ids finished after 0.000 seconds +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 582mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3744 memory location(s), 1 block(s), and 8255 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: Running pre_sched +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=3744 blocks=1 instructions=8255 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z INFO 49414 (sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49414 [LayerSpiller]: LayerSpill: Start... +2025-08-07T13:54:03Z INFO 49414 [LayerSpiller]: LayerSpill: Found 1 Splits CCs +2025-08-07T13:54:03Z INFO 49414 [LayerSpiller]: Grouped CCs to 1 clusters. +2025-08-07T13:54:03Z INFO 49414 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-08-07T13:54:03Z INFO 49414 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-08-07T13:54:03Z INFO 49414 [LayerSpiller]: LayerSpill: Done. +2025-08-07T13:54:03Z INFO 49414 (sg00) [PreSched]: Start split live ranges Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49414 (sg00) [PreSched]: No split opportunities: +2025-08-07T13:54:03Z INFO 49414 (sg00) [PreSched]: End split live ranges Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49414 (sg00) [PreSched]: Strt remove redundncies Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49414 (sg00) [PreSched]: remove_redundant_memsets +2025-08-07T13:54:03Z INFO 49414 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-08-07T13:54:03Z INFO 49414 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-08-07T13:54:03Z INFO 49414 (sg01) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-08-07T13:54:03Z INFO 49414 (sg01) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.008 seconds +2025-08-07T13:54:03Z INFO 49414 (sg01) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.009 seconds +2025-08-07T13:54:03Z USER 49414 (sg01) [ModuleForkPass]: vn_splitter finished after 0.026 seconds +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 582mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4625 memory location(s), 1 block(s), and 20003 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:03Z USER 49414 (sg01) [ModuleForkPass]: Running constant_propagate +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=4625 blocks=1 instructions=20003 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:03Z USER 49414 (sg02) [ModuleForkPass]: legalize_cce_dma finished after 0.012 seconds +2025-08-07T13:54:03Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 582mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11794 memory location(s), 1 block(s), and 60307 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:03Z USER 49414 (sg02) [ModuleForkPass]: Running pre_opts +2025-08-07T13:54:03Z INFO 49414 (sg00) [PreSched]: remove_redundant_memsets: 0 +2025-08-07T13:54:03Z INFO 49414 (sg00) [PreSched]: remove_redundant_loads +2025-08-07T13:54:03Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=11794 blocks=1 instructions=60307 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:03Z INFO 49414 (sg02) [PreOpts]: Skipped. No pre-opt passes enabled +2025-08-07T13:54:03Z INFO 49414 (sg01) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-08-07T13:54:03Z USER 49414 (sg02) [ModuleForkPass]: pre_opts finished after 0.001 seconds +2025-08-07T13:54:03Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 583mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11794 memory location(s), 1 block(s), and 60307 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:03Z USER 49414 (sg02) [ModuleForkPass]: Running error_injector +2025-08-07T13:54:03Z INFO 49414 (sg00) [PreSched]: remove_redundant_loads: 0 +2025-08-07T13:54:03Z INFO 49414 (sg00) [PreSched]: End remove redundncies Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49414 (sg00) [PreSched]: Start DCE Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=11794 blocks=1 instructions=60307 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:03Z WARNING 49414 (sg02) [ErrorInjector]: Unrecognized injected error value "0" +2025-08-07T13:54:03Z USER 49414 (sg02) [ModuleForkPass]: error_injector finished after 0.001 seconds +2025-08-07T13:54:03Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 583mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11794 memory location(s), 1 block(s), and 60307 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:03Z USER 49414 (sg02) [ModuleForkPass]: Running vn_splitter +2025-08-07T13:54:03Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=11794 blocks=1 instructions=60307 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:03Z INFO 49414 (sg02) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 14 +2025-08-07T13:54:03Z INFO 49414 (sg02) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-08-07T13:54:03Z INFO 49414 (sg00) [PreSched]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:03Z INFO 49414 (sg00) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49414 (sg00) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49414 (sg00) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49414 (sg00) [PreSched]: End DCE Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49414 (sg00) [PreSched]: Start build flow dependencies Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49414 (sg00) [build_flow_deps]: Start build fdeps. Invocation: 1Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49414 (sg00) [build_flow_deps]: Allocs: 3744 instructions: 8255 +2025-08-07T13:54:03Z INFO 49414 (sg00) [build_flow_deps]: Build fdeps inserted 20578 edges +2025-08-07T13:54:03Z INFO 49414 (sg00) [build_flow_deps]: Done build fdeps 20578 Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49414 (sg00) [PreSched]: End build flow dependencies Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49414 (sg00) [PreSched]: Start remove useless insts Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49414 (sg00) [PreSched]: remove_useless_insts +2025-08-07T13:54:03Z INFO 49414 (sg00) [PreSched]: remove Useless Instructions: 0 +2025-08-07T13:54:03Z INFO 49414 (sg00) [PreSched]: End remove useless insts Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49414 (sg00) [PreSched]: Start scratchpad optimization Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49414 (sg00) [PreSched]: End scratchpad optimization Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z INFO 49414 (sg00) [PreSched]: DONE PRE scheduling Thu Aug 7 13:54:03 2025 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: pre_sched finished after 0.046 seconds +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 584mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3744 memory location(s), 1 block(s), and 8255 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=3744 blocks=1 instructions=8255 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z INFO 49414 (sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-08-07T13:54:03Z INFO 49414 (sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:03Z INFO 49414 (sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:03Z INFO 49414 (sg00) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49414 (sg00) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49414 (sg00) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.008 seconds +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 584mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3744 memory location(s), 1 block(s), and 8255 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: Running dynamic_dma_setup +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=3744 blocks=1 instructions=8255 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 584mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3745 memory location(s), 1 block(s), and 8255 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: Running runtime_memory_reservation +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=3745 blocks=1 instructions=8255 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 584mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3745 memory location(s), 1 block(s), and 8255 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: Running coloring_allocator_psum +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=3745 blocks=1 instructions=8255 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z INFO 49414 (sg00) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:03Z INFO 49414 (sg00) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:03Z INFO 49414 (sg00) [PSUM_Allocator]: allocating PSUM +2025-08-07T13:54:03Z INFO 49414 (sg00) [PSUM_Allocator]: main loop +2025-08-07T13:54:03Z INFO 49414 (sg00) [PSUM_Allocator]: renumber locations +2025-08-07T13:54:03Z INFO 49414 (sg00) [PSUM_Allocator]: size = 1097 +2025-08-07T13:54:03Z INFO 49414 (sg00) [PSUM_Allocator]: build_no_bitmap start +2025-08-07T13:54:03Z INFO 49414 (sg00) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-08-07T13:54:03Z INFO 49414 (sg00) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-08-07T13:54:03Z INFO 49414 (sg00) [PSUM_Allocator]: found 2324 edges +2025-08-07T13:54:03Z INFO 49414 (sg00) [PSUM_Allocator]: mean: 4.23701 +2025-08-07T13:54:03Z INFO 49414 (sg00) [PSUM_Allocator]: median: 3.46069 +2025-08-07T13:54:03Z INFO 49414 (sg00) [PSUM_Allocator]: adjacency vectors require 18592 bytes +2025-08-07T13:54:03Z INFO 49414 (sg00) [PSUM_Allocator]: build_no_bitmap done +2025-08-07T13:54:03Z INFO 49414 (sg00) [PSUM_Allocator]: find costs +2025-08-07T13:54:03Z INFO 49414 (sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49414 (sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-08-07T13:54:03Z INFO 49414 (sg00) [PSUM_Allocator]: simplify interference graph +2025-08-07T13:54:03Z INFO 49414 (sg00) [PSUM_Allocator]: initialize low and high +2025-08-07T13:54:03Z INFO 49414 (sg00) [PSUM_Allocator]: lo = 1097 +2025-08-07T13:54:03Z INFO 49414 (sg00) [PSUM_Allocator]: hi = 0 +2025-08-07T13:54:03Z INFO 49414 (sg00) [PSUM_Allocator]: inf = 0 +2025-08-07T13:54:03Z INFO 49414 (sg00) [PSUM_Allocator]: total = 1097 +2025-08-07T13:54:03Z INFO 49414 (sg00) [PSUM_Allocator]: simplify +2025-08-07T13:54:03Z INFO 49414 (sg00) [PSUM_Allocator]: new candidates = 0 +2025-08-07T13:54:03Z INFO 49414 (sg00) [PSUM_Allocator]: select ranges +2025-08-07T13:54:03Z INFO 49414 (sg00) [PSUM_Allocator]: no more spills +2025-08-07T13:54:03Z INFO 49414 (sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-08-07T13:54:03Z INFO 49414 (sg00) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-08-07T13:54:03Z INFO 49414 (sg00) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: coloring_allocator_psum finished after 0.016 seconds +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 584mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3745 memory location(s), 1 block(s), and 8255 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: Running dma_optimization_psum +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=3745 blocks=1 instructions=8255 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z INFO 49414 (sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49414 (sg02) [ShrinkDN]: INFO (ShrinkDN): Shrunk 2 nodes. Total savings 14336 bytes/partition +2025-08-07T13:54:03Z INFO 49414 (sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-08-07T13:54:03Z INFO 49414 (sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: dma_optimization_psum finished after 0.003 seconds +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 584mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3745 memory location(s), 1 block(s), and 8255 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: Running address_rotation_psum +2025-08-07T13:54:03Z INFO 49414 (sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=3745 blocks=1 instructions=8255 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z INFO 49414 (sg01) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-08-07T13:54:03Z INFO 49414 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 240 PSUM Banks +2025-08-07T13:54:03Z INFO 49414 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 40 PSUM Banks +2025-08-07T13:54:03Z INFO 49414 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 334 PSUM Banks +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: address_rotation_psum finished after 0.014 seconds +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 584mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3745 memory location(s), 1 block(s), and 8255 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z USER 49414 (sg00) [ModuleForkPass]: Running coloring_allocator_sb +2025-08-07T13:54:03Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=3745 blocks=1 instructions=8255 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:03Z INFO 49414 (sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 75576064 +2025-08-07T13:54:03Z INFO 49414 (sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 2996 bytes +2025-08-07T13:54:03Z INFO 49414 (sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 22544386 +2025-08-07T13:54:03Z INFO 49414 (sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 2096 bytes +2025-08-07T13:54:03Z INFO 49414 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 6336512 +2025-08-07T13:54:03Z INFO 49414 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 307 bytes +2025-08-07T13:54:03Z INFO 49414 (sg00) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:03Z INFO 49414 (sg00) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: allocating SB +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: main loop +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: renumber locations +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: size = 2617 +2025-08-07T13:54:03Z INFO 49414 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-08-07T13:54:03Z INFO 49414 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: find partners +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: found 755 accumulation groups +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: largest = custom-call.226.1608_i1 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: tensors = 33 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: requires 66048 bytes/partition +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: expanding partners +2025-08-07T13:54:03Z INFO 49414 (sg02) [VNSplitterPass]: INFO (VNSplitter) Time: 0.001 seconds +2025-08-07T13:54:03Z INFO 49414 (sg02) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.027 seconds +2025-08-07T13:54:03Z INFO 49414 (sg02) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.031 seconds +2025-08-07T13:54:03Z USER 49414 (sg02) [ModuleForkPass]: vn_splitter finished after 0.084 seconds +2025-08-07T13:54:03Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 584mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11794 memory location(s), 1 block(s), and 60307 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:03Z USER 49414 (sg02) [ModuleForkPass]: Running constant_propagate +2025-08-07T13:54:03Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=11794 blocks=1 instructions=60307 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:03Z INFO 49414 []: find first defs for local +2025-08-07T13:54:03Z INFO 49414 []: find first defs for global +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: find loads +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: 1 pin count +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: 133 remat count +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: build interference graph +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: pass 1 int-tree +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: Num intervals 2617 Num locations 2617 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: IntervalTree Build Done +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: info.neighbors init Done +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: info.neighbors partners Done +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: IntervalTree readback Done +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: edge: 256786 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: mean: 196.245 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: median: 131.431 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: find costs +2025-08-07T13:54:03Z INFO 49414 (sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: simplify interference graph +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: initialize safe & unsafe +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: safe = 480 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: unsafe = 1849 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: inf = 287 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: total = 2616 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: simplify +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 1849 #Pinned 0 #Safe 0 minCost 0.000374499 maxCost 0.0895006 locations 2617 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: new candidates = 201 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: select ranges +2025-08-07T13:54:03Z INFO 49414 (sg02) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-08-07T13:54:03Z INFO 49414 (sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: Total: 2616 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: Spilled: 0.021 (54) +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: Allocated: 0.979 (2562) +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: Rover zone: 0.194 (497) +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: Pre-rover zone: 0.007 (17) +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: Post-rover zone: 0.799 (2048) +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: Slice zone: 0.000 (0) +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: Blocks tall: 1.000 (2562) +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.999 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: Success +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: SB spills = 54 tensors +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: size = 82432 bytes/partition +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: remats = 1 tensors +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: unpinned = 0 tensors +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: SB score = 992889 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: best SB heuristic = 0 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: collect spills +2025-08-07T13:54:03Z INFO 49414 (sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z INFO 49414 (sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:03Z USER 49414 (sg01) [ModuleForkPass]: constant_propagate finished after 0.132 seconds +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 588mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4625 memory location(s), 1 block(s), and 20003 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:03Z USER 49414 (sg01) [ModuleForkPass]: Running lower_ac +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: insert spills +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=4625 blocks=1 instructions=20003 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:03Z INFO 49414 (sg01) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-08-07T13:54:03Z USER 49414 (sg01) [ModuleForkPass]: lower_ac finished after 0.004 seconds +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 588mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4625 memory location(s), 1 block(s), and 20003 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:03Z USER 49414 (sg01) [ModuleForkPass]: Running input_dma_coalescing +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=4625 blocks=1 instructions=20003 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: deleting loads #loadsToDelete: 0 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: deleting locs #locationsToDelete: 0 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: locationsToDelete done +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: main loop +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: renumber locations +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: size = 3047 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: find partners +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: found 755 accumulation groups +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: largest = custom-call.226.1608_i1 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: tensors = 33 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: requires 66048 bytes/partition +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: expanding partners +2025-08-07T13:54:03Z INFO 49414 (sg01) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-08-07T13:54:03Z USER 49414 (sg01) [ModuleForkPass]: input_dma_coalescing finished after 0.011 seconds +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 589mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 []: find first defs for local +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4625 memory location(s), 1 block(s), and 20003 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:03Z USER 49414 (sg01) [ModuleForkPass]: Running remat_optimization +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=4625 blocks=1 instructions=20003 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:03Z INFO 49414 []: find first defs for global +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: find loads +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: 1 pin count +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: 531 remat count +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: build interference graph +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: pass 1 int-tree +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: Num intervals 3047 Num locations 3047 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: IntervalTree Build Done +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: info.neighbors init Done +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: info.neighbors partners Done +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: IntervalTree readback Done +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: edge: 203902 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: mean: 133.838 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: median: 85.5276 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: find costs +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: simplify interference graph +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: initialize safe & unsafe +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: safe = 72 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: unsafe = 12 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: inf = 400 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: total = 484 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: simplify +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 6 #Pinned 0 #Safe 0 minCost 0.017119 maxCost 0.017119 locations 3047 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: new candidates = 2 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: (including 355 infinite cost tensors) +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: select ranges +2025-08-07T13:54:03Z INFO 49414 (sg01) [RematOpt]: Removed 0 remat instructions +2025-08-07T13:54:03Z USER 49414 (sg01) [ModuleForkPass]: remat_optimization finished after 0.014 seconds +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 589mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4625 memory location(s), 1 block(s), and 20003 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:03Z USER 49414 (sg01) [ModuleForkPass]: Running early_peephole_opts +2025-08-07T13:54:03Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to early_peephole_opts: modules=1 functions=1 allocs=4625 blocks=1 instructions=20003 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:03Z INFO 49414 (sg01) [EarlyPeepholeOpts]: PeepholeOpts enabled? ActivationAccumulate: true +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: Total: 484 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: Spilled: 0.000 (0) +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: Allocated: 1.000 (484) +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: Rover zone: 0.593 (287) +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: Pre-rover zone: 0.021 (10) +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: Post-rover zone: 0.386 (187) +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: Slice zone: 0.000 (0) +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: Blocks tall: 1.000 (484) +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-08-07T13:54:03Z INFO 49414 (sg00) [SB_Allocator]: Success +2025-08-07T13:54:04Z INFO 49414 (sg00) [SB_Allocator]: SB spills = 0 tensors +2025-08-07T13:54:04Z INFO 49414 (sg00) [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:54:04Z INFO 49414 (sg00) [SB_Allocator]: remats = 0 tensors +2025-08-07T13:54:04Z INFO 49414 (sg00) [SB_Allocator]: unpinned = 0 tensors +2025-08-07T13:54:04Z INFO 49414 (sg00) [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:54:04Z INFO 49414 (sg00) [SB_Allocator]: SB score = 0 +2025-08-07T13:54:04Z INFO 49414 (sg00) [SB_Allocator]: spilling from SB cost about 992889 cycles +2025-08-07T13:54:04Z INFO 49414 (sg00) [SB_Allocator]: number of tensors spilled from SB = 54 +2025-08-07T13:54:04Z INFO 49414 (sg00) [SB_Allocator]: total size of spilled tensors = 82432 bytes/partition +2025-08-07T13:54:04Z INFO 49414 (sg00) [SB_Allocator]: 16384 bytes/partition (100%) successfully pinned +2025-08-07T13:54:04Z INFO 49414 (sg00) [SB_Allocator]: pinning saved approximately 9010 cycles +2025-08-07T13:54:04Z INFO 49414 (sg00) [SB_Allocator]: 0% SB utilization after allocation +2025-08-07T13:54:04Z INFO 49414 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 145240832 +2025-08-07T13:54:04Z INFO 49414 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 1809 bytes +2025-08-07T13:54:04Z INFO 49414 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 40960002 +2025-08-07T13:54:04Z INFO 49414 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 1893 bytes +2025-08-07T13:54:04Z INFO 49414 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 6336512 +2025-08-07T13:54:04Z INFO 49414 (sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 307 bytes +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: coloring_allocator_sb finished after 0.084 seconds +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 589mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4228 memory location(s), 1 block(s), and 8770 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=4228 blocks=1 instructions=8770 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z INFO 49414 (sg01) [EarlyPeepholeOpts]: Activation Accumulate: 256 +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: address_rotation_sb finished after 0.006 seconds +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 589mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4228 memory location(s), 1 block(s), and 8770 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: Running dma_optimization_sb +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=4228 blocks=1 instructions=8770 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 186200834, 31.5782% input load, 5.34985% output write, 63.072% spill/reload [sg0000] +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: early_peephole_opts finished after 0.016 seconds +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 589mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4625 memory location(s), 1 block(s), and 20003 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=4625 blocks=1 instructions=20003 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: removed 0 identical load +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.003 seconds +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 589mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4625 memory location(s), 1 block(s), and 20003 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: Running infer_stream_ids +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=4625 blocks=1 instructions=20003 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z INFO 49414 (sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: infer_stream_ids finished after 0.002 seconds +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 589mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4625 memory location(s), 1 block(s), and 20003 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: Running pre_sched +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=4625 blocks=1 instructions=20003 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z INFO 49414 (sg01) [PreSched]: Start PRE scheduling 2 cores: 1 at: Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 [LayerSpiller]: LayerSpill: Start... +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 1572864, 0.844714% out of total dma traffic(5.87988e+07) +2025-08-07T13:54:04Z INFO 49414 [LayerSpiller]: LayerSpill: Found 2 Splits CCs +2025-08-07T13:54:04Z INFO 49414 [LayerSpiller]: Grouped CCs to 2 clusters. +2025-08-07T13:54:04Z INFO 49414 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-08-07T13:54:04Z INFO 49414 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-08-07T13:54:04Z INFO 49414 [LayerSpiller]: LayerSpill: Done. +2025-08-07T13:54:04Z INFO 49414 (sg01) [PreSched]: Start split live ranges Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 237 spill/reload instructions +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 232 spill/reload memory locations +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: [spill optimization round 1]: removed 20 spill/reload instructions +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: [spill optimization round 1]: removed 20 spill/reload memory locations +2025-08-07T13:54:04Z INFO 49414 (sg01) [PreSched]: Num_Splits: 0 +2025-08-07T13:54:04Z INFO 49414 (sg01) [PreSched]: End split live ranges Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg01) [PreSched]: Strt remove redundncies Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg01) [PreSched]: remove_redundant_memsets +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: [spill optimization round 2]: removed 4 spill/reload instructions +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: [spill optimization round 2]: removed 4 spill/reload memory locations +2025-08-07T13:54:04Z INFO 49414 (sg01) [PreSched]: remove_redundant_memsets: 0 +2025-08-07T13:54:04Z INFO 49414 (sg01) [PreSched]: remove_redundant_loads +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: [spill optimization round 3]: removed 0 spill/reload instructions +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: [spill optimization round 3]: removed 0 spill/reload memory locations +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 38928384, 33.1473% out of total spill/reload dma traffic +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:04Z INFO 49414 (sg01) [PreSched]: remove_redundant_loads: 0 +2025-08-07T13:54:04Z INFO 49414 (sg01) [PreSched]: End remove redundncies Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg01) [PreSched]: Start DCE Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 2 spill/reload instructions +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 1 spill/reload memory locations +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 52 SpillSaves and Reloads +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: average loaded DMA size 2508 bytes +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: average saved DMA size 2061 bytes +2025-08-07T13:54:04Z INFO 49414 (sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 1 combined 4 SpillSaves and Reloads +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: average loaded DMA size 2532 bytes +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: average saved DMA size 2075 bytes +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 105984768 +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 2532 bytes +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 39583746 +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 2075 bytes +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 3637248, 3.0971% out of total spill/reload dma traffic +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 44138496, 23.7048% out of total dma traffic +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 142062338, 41.3895% input load, 7.01204% output write, 51.5985% spill/reload [sg0000] +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 104166144 +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 2488 bytes +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 37896194 +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 1986 bytes +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 6336512 +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 307 bytes +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 1818 bytes +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: dma_optimization_sb finished after 0.079 seconds +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 590mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3899 memory location(s), 1 block(s), and 8451 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=3899 blocks=1 instructions=8451 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z INFO 49414 (sg01) [PreSched]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: SB Rotation rotated 5 Sb address +2025-08-07T13:54:04Z INFO 49414 (sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: SB Rotation rotated 90 Sb address +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: SB Rotation rotated 80 Sb address +2025-08-07T13:54:04Z INFO 49414 (sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:04Z INFO 49414 (sg01) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:04Z INFO 49414 (sg01) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: SB Rotation rotated 31 Sb address +2025-08-07T13:54:04Z INFO 49414 (sg01) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:04Z INFO 49414 (sg01) [PreSched]: End DCE Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg02) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-08-07T13:54:04Z INFO 49414 (sg01) [PreSched]: Start build flow dependencies Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg01) [build_flow_deps]: Start build fdeps. Invocation: 2Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg01) [build_flow_deps]: Allocs: 4625 instructions: 20003 +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: SB Rotation rotated 680 Sb address +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: address_rotation_sb finished after 0.057 seconds +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 590mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3899 memory location(s), 1 block(s), and 8451 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: Running coloring_allocator_dram +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=3899 blocks=1 instructions=8451 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z INFO 49414 (sg00) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:04Z INFO 49414 (sg00) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:04Z INFO 49414 (sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-08-07T13:54:04Z INFO 49414 (sg00) [DRAM_Allocator]: reserved space = 686899974 bytes +2025-08-07T13:54:04Z INFO 49414 (sg00) [DRAM_Allocator]: spill space = 39321600 bytes +2025-08-07T13:54:04Z INFO 49414 (sg00) [DRAM_Allocator]: aligned spill space = 39321600 bytes +2025-08-07T13:54:04Z INFO 49414 (sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-08-07T13:54:04Z INFO 49414 (sg00) [DRAM_Allocator]: renumber locations +2025-08-07T13:54:04Z INFO 49414 (sg00) [DRAM_Allocator]: size = 44 +2025-08-07T13:54:04Z INFO 49414 []: find first defs for local +2025-08-07T13:54:04Z INFO 49414 []: find first defs for global +2025-08-07T13:54:04Z INFO 49414 (sg00) [DRAM_Allocator]: Num intervals 44 Num locations 44 +2025-08-07T13:54:04Z INFO 49414 (sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-08-07T13:54:04Z INFO 49414 (sg00) [DRAM_Allocator]: info.neighbors init Done +2025-08-07T13:54:04Z INFO 49414 (sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-08-07T13:54:04Z INFO 49414 (sg00) [DRAM_Allocator]: simplify interference graph +2025-08-07T13:54:04Z INFO 49414 (sg00) [DRAM_Allocator]: initialize low and high +2025-08-07T13:54:04Z INFO 49414 (sg00) [DRAM_Allocator]: lo = 44 +2025-08-07T13:54:04Z INFO 49414 (sg00) [DRAM_Allocator]: hi = 0 +2025-08-07T13:54:04Z INFO 49414 (sg00) [DRAM_Allocator]: total = 44 +2025-08-07T13:54:04Z INFO 49414 (sg00) [DRAM_Allocator]: simplify +2025-08-07T13:54:04Z INFO 49414 (sg00) [DRAM_Allocator]: new candidates = 0 +2025-08-07T13:54:04Z INFO 49414 (sg00) [DRAM_Allocator]: select ranges +2025-08-07T13:54:04Z INFO 49414 (sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-08-07T13:54:04Z INFO 49414 (sg00) [DRAM_Allocator]: allreduce_dram_hwm 29360128 +2025-08-07T13:54:04Z INFO 49414 (sg00) [DRAM_Allocator]: Real CC buffer size 29360128 +2025-08-07T13:54:04Z INFO 49414 (sg00) [DRAM_Allocator]: DRAM hwm after allocation: 39321600 +2025-08-07T13:54:04Z INFO 49414 (sg00) [DRAM_Allocator]: DRAM allocation successful +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: coloring_allocator_dram finished after 0.008 seconds +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 591mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3899 memory location(s), 1 block(s), and 8451 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: Running address_rotation_dram +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=3899 blocks=1 instructions=8451 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: DRAM hwm before rotation 39321600 +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: allreduce hwm 29360128 +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: Real CC buffer size 29360128 +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: DRAM hwm after rotation 39321600 +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: address_rotation_dram finished after 0.004 seconds +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 591mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3899 memory location(s), 1 block(s), and 8451 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: Running tensorcopy_accel +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=3899 blocks=1 instructions=8451 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z INFO 49414 (sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-08-07T13:54:04Z INFO 49414 (sg00) [TensorCopyAccel::Impl]: Accelerated 0 out of 843 tensorcopy in Function: sg0000 average acceleration factor: -nan +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.001 seconds +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 591mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3899 memory location(s), 1 block(s), and 8451 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: Running peephole_opts +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=3899 blocks=1 instructions=8451 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z INFO 49414 (sg00) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: peephole_opts finished after 0.008 seconds +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 591mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3899 memory location(s), 1 block(s), and 8451 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: Running lower_kernel +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=3899 blocks=1 instructions=8451 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z INFO 49414 (sg00) [LowerKernel]: Started running LowerKernel +2025-08-07T13:54:04Z INFO 49414 (sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 8451, number of allocs: 3899 +2025-08-07T13:54:04Z INFO 49414 (sg00) [LowerKernel]: Scan BKs time (s): 0.062925 +2025-08-07T13:54:04Z INFO 49414 (sg00) [LowerKernel]: Lower BKs time (s): 0.000264 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: lower_kernel finished after 0.001 seconds +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 591mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3899 memory location(s), 1 block(s), and 8451 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=3899 blocks=1 instructions=8451 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 591mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3899 memory location(s), 1 block(s), and 8451 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=3899 blocks=1 instructions=8451 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.001 seconds +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 591mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3899 memory location(s), 1 block(s), and 8451 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=3899 blocks=1 instructions=8451 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: birverifier finished after 0.006 seconds +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 591mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3899 memory location(s), 1 block(s), and 8451 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: Running dynamic_dma_scan +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=3899 blocks=1 instructions=8451 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: dynamic_dma_scan finished after 0.001 seconds +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 591mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3899 memory location(s), 1 block(s), and 8451 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z INFO 49414 (sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: Running build_fdeps +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=3899 blocks=1 instructions=8451 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z INFO 49414 (sg00) [build_flow_deps]: Start build fdeps. Invocation: 3Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg00) [build_flow_deps]: Allocs: 3899 instructions: 8451 +2025-08-07T13:54:04Z INFO 49414 (sg01) [build_flow_deps]: Build fdeps inserted 61350 edges +2025-08-07T13:54:04Z INFO 49414 (sg01) [build_flow_deps]: Done build fdeps 61350 Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg01) [PreSched]: End build flow dependencies Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg01) [PreSched]: Start remove useless insts Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg01) [PreSched]: remove_useless_insts +2025-08-07T13:54:04Z INFO 49414 (sg01) [PreSched]: remove Useless Instructions: 0 +2025-08-07T13:54:04Z INFO 49414 (sg01) [PreSched]: End remove useless insts Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg01) [PreSched]: Start scratchpad optimization Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg01) [PreSched]: End scratchpad optimization Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg00) [build_flow_deps]: Build fdeps inserted 20910 edges +2025-08-07T13:54:04Z INFO 49414 (sg00) [build_flow_deps]: Done build fdeps 20910 Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: build_fdeps finished after 0.014 seconds +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 594mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3899 memory location(s), 1 block(s), and 8451 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: Running remove_redundancies +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=3899 blocks=1 instructions=8451 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z INFO 49414 (sg00) [RemoveRedundancies]: remove_clobbered_writes +2025-08-07T13:54:04Z INFO 49414 (sg00) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-08-07T13:54:04Z INFO 49414 (sg00) [RemoveRedundancies]: remove_useless_insts +2025-08-07T13:54:04Z INFO 49414 (sg00) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: remove_redundancies finished after 0.002 seconds +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 594mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3899 memory location(s), 1 block(s), and 8451 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=3899 blocks=1 instructions=8451 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z INFO 49414 (sg00) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:04Z INFO 49414 (sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:54:04Z INFO 49414 (sg00) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:04Z INFO 49414 (sg01) [PreSched]: DONE PRE scheduling Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: pre_sched finished after 0.191 seconds +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 594mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4625 memory location(s), 1 block(s), and 20003 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=4625 blocks=1 instructions=20003 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z INFO 49414 (sg01) [TensorCopyElim]: Tensor CP elimination: 0 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.031 seconds +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 594mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3899 memory location(s), 1 block(s), and 8451 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=3899 blocks=1 instructions=8451 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z INFO 49414 (sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-08-07T13:54:04Z INFO 49414 (sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:04Z INFO 49414 (sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:04Z INFO 49414 (sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.010 seconds +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 594mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3899 memory location(s), 1 block(s), and 8451 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=3899 blocks=1 instructions=8451 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 594mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3899 memory location(s), 1 block(s), and 8451 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: Running post_sched +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=3899 blocks=1 instructions=8451 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z INFO 49414 [post_scheduler]: Start PosT ScheD 3 sunda Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg01) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:04Z INFO 49414 (sg01) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:04Z INFO 49414 (sg01) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.036 seconds +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 594mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4625 memory location(s), 1 block(s), and 20003 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: Running dynamic_dma_setup +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=4625 blocks=1 instructions=20003 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 594mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4626 memory location(s), 1 block(s), and 20003 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: Running runtime_memory_reservation +2025-08-07T13:54:04Z INFO 49414 (sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=4626 blocks=1 instructions=20003 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 594mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4626 memory location(s), 1 block(s), and 20003 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: Running coloring_allocator_psum +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=4626 blocks=1 instructions=20003 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z INFO 49414 (sg01) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:04Z INFO 49414 (sg01) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:04Z INFO 49414 (sg01) [PSUM_Allocator]: allocating PSUM +2025-08-07T13:54:04Z INFO 49414 (sg01) [PSUM_Allocator]: main loop +2025-08-07T13:54:04Z INFO 49414 (sg01) [PSUM_Allocator]: renumber locations +2025-08-07T13:54:04Z INFO 49414 (sg01) [PSUM_Allocator]: size = 1228 +2025-08-07T13:54:04Z INFO 49414 (sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:04Z USER 49414 (sg02) [ModuleForkPass]: constant_propagate finished after 0.333 seconds +2025-08-07T13:54:04Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 594mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11794 memory location(s), 1 block(s), and 60307 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:04Z USER 49414 (sg02) [ModuleForkPass]: Running lower_ac +2025-08-07T13:54:04Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=11794 blocks=1 instructions=60307 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:04Z INFO 49414 (sg01) [PSUM_Allocator]: build_no_bitmap start +2025-08-07T13:54:04Z INFO 49414 (sg01) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-08-07T13:54:04Z INFO 49414 (sg01) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-08-07T13:54:04Z INFO 49414 (sg01) [PSUM_Allocator]: found 3574 edges +2025-08-07T13:54:04Z INFO 49414 (sg01) [PSUM_Allocator]: mean: 5.82085 +2025-08-07T13:54:04Z INFO 49414 (sg01) [PSUM_Allocator]: median: 6.99819 +2025-08-07T13:54:04Z INFO 49414 (sg01) [PSUM_Allocator]: adjacency vectors require 28592 bytes +2025-08-07T13:54:04Z INFO 49414 (sg01) [PSUM_Allocator]: build_no_bitmap done +2025-08-07T13:54:04Z INFO 49414 (sg01) [PSUM_Allocator]: find costs +2025-08-07T13:54:04Z INFO 49414 (sg02) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-08-07T13:54:04Z USER 49414 (sg02) [ModuleForkPass]: lower_ac finished after 0.013 seconds +2025-08-07T13:54:04Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 594mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11794 memory location(s), 1 block(s), and 60307 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:04Z USER 49414 (sg02) [ModuleForkPass]: Running input_dma_coalescing +2025-08-07T13:54:04Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=11794 blocks=1 instructions=60307 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:04Z INFO 49414 [post_scheduler]: Time-aware hwm post-sched +2025-08-07T13:54:04Z INFO 49414 (sg02) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-08-07T13:54:04Z USER 49414 (sg02) [ModuleForkPass]: input_dma_coalescing finished after 0.026 seconds +2025-08-07T13:54:04Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 595mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11794 memory location(s), 1 block(s), and 60307 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:04Z USER 49414 (sg02) [ModuleForkPass]: Running remat_optimization +2025-08-07T13:54:04Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=11794 blocks=1 instructions=60307 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:04Z INFO 49414 (sg01) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-08-07T13:54:04Z INFO 49414 (sg01) [PSUM_Allocator]: simplify interference graph +2025-08-07T13:54:04Z INFO 49414 (sg01) [PSUM_Allocator]: initialize low and high +2025-08-07T13:54:04Z INFO 49414 (sg01) [PSUM_Allocator]: lo = 1228 +2025-08-07T13:54:04Z INFO 49414 (sg01) [PSUM_Allocator]: hi = 0 +2025-08-07T13:54:04Z INFO 49414 (sg01) [PSUM_Allocator]: inf = 0 +2025-08-07T13:54:04Z INFO 49414 (sg01) [PSUM_Allocator]: total = 1228 +2025-08-07T13:54:04Z INFO 49414 (sg01) [PSUM_Allocator]: simplify +2025-08-07T13:54:04Z INFO 49414 (sg01) [PSUM_Allocator]: new candidates = 0 +2025-08-07T13:54:04Z INFO 49414 (sg01) [PSUM_Allocator]: select ranges +2025-08-07T13:54:04Z INFO 49414 (sg01) [PSUM_Allocator]: no more spills +2025-08-07T13:54:04Z INFO 49414 (sg01) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-08-07T13:54:04Z INFO 49414 (sg01) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-08-07T13:54:04Z INFO 49414 (sg01) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: coloring_allocator_psum finished after 0.067 seconds +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 596mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4626 memory location(s), 1 block(s), and 20003 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: Running dma_optimization_psum +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=4626 blocks=1 instructions=20003 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: dma_optimization_psum finished after 0.010 seconds +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 596mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4626 memory location(s), 1 block(s), and 20003 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: Running address_rotation_psum +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=4626 blocks=1 instructions=20003 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 32 PSUM Banks +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 16 PSUM Banks +2025-08-07T13:54:04Z INFO 49414 [post_scheduler]: Time-aware simulation time: 2192337 +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 2 PSUM Banks +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: address_rotation_psum finished after 0.040 seconds +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 598mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4626 memory location(s), 1 block(s), and 20003 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: Running coloring_allocator_sb +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=4626 blocks=1 instructions=20003 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z INFO 49414 [post_scheduler]: Done PosT ScheD Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: post_sched finished after 0.130 seconds +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 598mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3899 memory location(s), 1 block(s), and 8451 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: Running expand_scheduling_units +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=3899 blocks=1 instructions=8451 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z INFO 49414 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 279003648 +2025-08-07T13:54:04Z INFO 49414 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 1083 bytes +2025-08-07T13:54:04Z INFO 49414 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 25165826 +2025-08-07T13:54:04Z INFO 49414 (sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 2457 bytes +2025-08-07T13:54:04Z INFO 49414 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 2129920 +2025-08-07T13:54:04Z INFO 49414 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-08-07T13:54:04Z INFO 49414 (sg01) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:04Z INFO 49414 (sg01) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: expand_scheduling_units finished after 0.001 seconds +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 598mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3899 memory location(s), 1 block(s), and 8451 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=3899 blocks=1 instructions=8451 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: allocating SB +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: main loop +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: renumber locations +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: size = 3365 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: find partners +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: found 1176 accumulation groups +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: largest = _dot.6-t1042_i41 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: tensors = 96 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: requires 98304 bytes/partition +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: expanding partners +2025-08-07T13:54:04Z INFO 49414 (sg02) [RematOpt]: Removed 0 remat instructions +2025-08-07T13:54:04Z USER 49414 (sg02) [ModuleForkPass]: remat_optimization finished after 0.099 seconds +2025-08-07T13:54:04Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 603mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11794 memory location(s), 1 block(s), and 60307 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:04Z USER 49414 (sg02) [ModuleForkPass]: Running early_peephole_opts +2025-08-07T13:54:04Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to early_peephole_opts: modules=1 functions=1 allocs=11794 blocks=1 instructions=60307 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:04Z INFO 49414 (sg02) [EarlyPeepholeOpts]: PeepholeOpts enabled? ActivationAccumulate: true +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 385 PSUM Banks +2025-08-07T13:54:04Z INFO 49414 []: find first defs for local +2025-08-07T13:54:04Z INFO 49414 []: find first defs for global +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 368 PSUM Banks +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: find loads +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: PSUM Rotation rotated 201 PSUM Banks +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: 1 pin count +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: 410 remat count +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: build interference graph +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: pass 1 int-tree +2025-08-07T13:54:04Z INFO 49414 (sg02) [EarlyPeepholeOpts]: Activation Accumulate: 0 +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: SB Rotation rotated 11 Sb address +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: Num intervals 3365 Num locations 3365 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: IntervalTree Build Done +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: info.neighbors init Done +2025-08-07T13:54:04Z USER 49414 (sg02) [ModuleForkPass]: early_peephole_opts finished after 0.029 seconds +2025-08-07T13:54:04Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 606mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11794 memory location(s), 1 block(s), and 60307 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:04Z USER 49414 (sg02) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-08-07T13:54:04Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=11794 blocks=1 instructions=60307 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: SB Rotation rotated 10 Sb address +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: info.neighbors partners Done +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: IntervalTree readback Done +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: edge: 369197 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: mean: 219.434 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: median: 142.66 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: find costs +2025-08-07T13:54:04Z USER 49414 (sg02) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.007 seconds +2025-08-07T13:54:04Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 606mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11794 memory location(s), 1 block(s), and 60307 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:04Z USER 49414 (sg02) [ModuleForkPass]: Running infer_stream_ids +2025-08-07T13:54:04Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=11794 blocks=1 instructions=60307 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: SB Rotation rotated 93 Sb address +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: simplify interference graph +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: initialize safe & unsafe +2025-08-07T13:54:04Z USER 49414 (sg02) [ModuleForkPass]: infer_stream_ids finished after 0.007 seconds +2025-08-07T13:54:04Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 606mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11794 memory location(s), 1 block(s), and 60307 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:04Z USER 49414 (sg02) [ModuleForkPass]: Running pre_sched +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: safe = 442 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: unsafe = 2024 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: inf = 898 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: total = 3364 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: simplify +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 2013 #Pinned 0 #Safe 0 minCost 0.000602867 maxCost 0.0489795 locations 3365 +2025-08-07T13:54:04Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=11794 blocks=1 instructions=60307 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:04Z INFO 49414 (sg02) [PreSched]: Start PRE scheduling 2 cores: 1 at: Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 [LayerSpiller]: LayerSpill: Start... +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: new candidates = 600 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: select ranges +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: SB Rotation rotated 59 Sb address +2025-08-07T13:54:04Z INFO 49414 [LayerSpiller]: LayerSpill: Found 1 Splits CCs +2025-08-07T13:54:04Z INFO 49414 [LayerSpiller]: Grouped CCs to 1 clusters. +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: SB Rotation rotated 381 Sb address +2025-08-07T13:54:04Z INFO 49414 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-08-07T13:54:04Z INFO 49414 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-08-07T13:54:04Z INFO 49414 [LayerSpiller]: LayerSpill: Done. +2025-08-07T13:54:04Z INFO 49414 (sg02) [PreSched]: Start split live ranges Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: Total: 3364 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: Spilled: 0.043 (144) +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: Allocated: 0.957 (3220) +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: Rover zone: 0.344 (1107) +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: Pre-rover zone: 0.005 (16) +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: Post-rover zone: 0.651 (2097) +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: Blocks tall: 1.000 (3220) +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: Success +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: SB spills = 144 tensors +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: size = 196608 bytes/partition +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: remats = 0 tensors +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: unpinned = 0 tensors +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: SB score = 1.52973e+06 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: best SB heuristic = 0 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: collect spills +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:04Z INFO 49414 (sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: address_rotation_sb finished after 0.117 seconds +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 608mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3899 memory location(s), 1 block(s), and 8451 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=3899 blocks=1 instructions=8451 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z INFO 49414 (sg00) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:04Z INFO 49414 (sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:54:04Z INFO 49414 (sg00) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: insert spills +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: deleting loads #loadsToDelete: 0 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: deleting locs #locationsToDelete: 0 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: locationsToDelete done +2025-08-07T13:54:04Z INFO 49414 (sg02) [PreSched]: Num_Splits: 0 +2025-08-07T13:54:04Z INFO 49414 (sg02) [PreSched]: End split live ranges Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg02) [PreSched]: Strt remove redundncies Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg02) [PreSched]: remove_redundant_memsets +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: main loop +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: renumber locations +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: size = 3899 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: find partners +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.033 seconds +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 613mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3899 memory location(s), 1 block(s), and 8451 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=3899 blocks=1 instructions=8451 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z INFO 49414 (sg00) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:04Z INFO 49414 (sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-08-07T13:54:04Z INFO 49414 (sg00) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: found 1176 accumulation groups +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: largest = _dot.6-t1042_i41 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: tensors = 96 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: requires 98304 bytes/partition +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: expanding partners +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.005 seconds +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 613mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3899 memory location(s), 1 block(s), and 8451 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: Running dep_opt +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=3899 blocks=1 instructions=8451 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z INFO 49414 (sg00) [build_flow_deps]: Start build fdeps. Invocation: 4Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg00) [build_flow_deps]: Allocs: 3899 instructions: 8451 +2025-08-07T13:54:04Z INFO 49414 (sg02) [PreSched]: remove_redundant_memsets: 2 +2025-08-07T13:54:04Z INFO 49414 (sg02) [PreSched]: remove_redundant_loads +2025-08-07T13:54:04Z INFO 49414 (sg00) [build_flow_deps]: Build fdeps inserted 20242 edges +2025-08-07T13:54:04Z INFO 49414 (sg00) [build_flow_deps]: Done build fdeps 20242 Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: dep_opt finished after 0.027 seconds +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 613mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3899 memory location(s), 1 block(s), and 8451 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: Running report_stats +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=3899 blocks=1 instructions=8451 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z INFO 49414 (sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌──────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 17 │ 9957281792 │ +│ DMACopy │ Internal -> ExternalOutput │ 64 │ 67108864 │ +│ DMACopy │ Internal -> Output │ 1 │ 16777216 │ +│ Load │ Const -> Internal │ 3 │ 65792 │ +│ Load │ ExternalInput -> Internal │ 148 │ 58733056 │ +│ Load │ Internal │ 178 │ 45367296 │ +│ Save │ Internal │ 62 │ 16252928 │ +│ Save │ Internal -> Output │ 37 │ 9961474 │ +│ Save (Spill) │ Internal │ 51 │ 11681792 │ +└──────────────┴────────────────────────────┴───────┴────────────┘ + +2025-08-07T13:54:04Z INFO 49414 (sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 3 │ +│ 64 │ 1 │ +│ 256 │ 3 │ +│ 512 │ 1 │ +│ 896 │ 6 │ +│ 1024 │ 46 │ +│ 1920 │ 32 │ +│ 2048 │ 288 │ +│ 4096 │ 116 │ +│ 262144 │ 64 │ +│ 8388608 │ 2 │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:04Z INFO 49414 (sg00) [ReportStats]: MM Stats: #MatMults 4368 #MatMult-Transposes 1312 +2025-08-07T13:54:04Z INFO 49414 (sg00) [ReportStats]: IO Tensor size combined: 668484100 +2025-08-07T13:54:04Z INFO 49414 (sg00) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input76 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input77 │ ExternalInput │ bfloat16 │ 16777216 │ +│ input83 │ ExternalInput │ bfloat16 │ 16777216 │ +│ input78 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input81 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input5 │ ExternalInput │ bfloat16 │ 1048576 │ +│ input4 │ ExternalInput │ bfloat16 │ 1048576 │ +│ output1 │ ExternalOutput │ bfloat16 │ 1048576 │ +│ output2 │ ExternalOutput │ bfloat16 │ 1048576 │ +│ input79 │ ExternalInput │ bfloat16 │ 8192 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-08-07T13:54:04Z INFO 49414 (sg00) [ReportStats]: Large (Internal) Tensor Statistics: +┌───────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├───────────────────────────┼──────────┼──────────┼──────────────┤ +│ intermediate1 │ Output │ bfloat16 │ 8388608 │ +│ intermediate4-buffer-2240 │ Internal │ bfloat16 │ 8388608 │ +│ intermediate4 │ Output │ bfloat16 │ 8388608 │ +│ dot.4-buffer-2238 │ Internal │ bfloat16 │ 8388608 │ +│ all_gather.1_i0 │ Internal │ bfloat16 │ 4194304 │ +│ all_gather.1_i1 │ Internal │ bfloat16 │ 4194304 │ +│ transpose.1_i1 │ Internal │ bfloat16 │ 2097152 │ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ transpose.1_i0 │ Internal │ bfloat16 │ 2097152 │ +│ intermediate0 │ Output │ uint8 │ 1048576 │ +└───────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-08-07T13:54:04Z USER 49414 (sg00) [ModuleForkPass]: report_stats finished after 0.002 seconds +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 613mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3899 memory location(s), 1 block(s), and 8451 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:04Z INFO 49414 []: find first defs for local +2025-08-07T13:54:04Z INFO 49414 []: find first defs for global +2025-08-07T13:54:04Z INFO 49414 (sg02) [PreSched]: remove_redundant_loads: 0 +2025-08-07T13:54:04Z INFO 49414 (sg02) [PreSched]: End remove redundncies Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg02) [PreSched]: Start DCE Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: find loads +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: 1 pin count +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: 880 remat count +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: build interference graph +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: pass 1 int-tree +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: Num intervals 3899 Num locations 3899 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: IntervalTree Build Done +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: info.neighbors init Done +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: info.neighbors partners Done +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: IntervalTree readback Done +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: edge: 309665 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: mean: 158.843 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: median: 113.474 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: find costs +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: simplify interference graph +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: initialize safe & unsafe +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: safe = 257 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: unsafe = 12 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: inf = 409 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: total = 678 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: simplify +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 10 #Pinned 0 #Safe 0 minCost 0.012868 maxCost 0.0320175 locations 3899 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: new candidates = 9 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: (including 409 infinite cost tensors) +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: select ranges +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: Total: 678 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: Spilled: 0.000 (0) +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: Allocated: 1.000 (678) +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: Rover zone: 0.618 (419) +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: Pre-rover zone: 0.013 (9) +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: Post-rover zone: 0.369 (250) +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: Blocks tall: 1.000 (678) +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: Success +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: SB spills = 0 tensors +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: remats = 0 tensors +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: unpinned = 0 tensors +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: SB score = 0 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: spilling from SB cost about 1.52973e+06 cycles +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: number of tensors spilled from SB = 144 +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: total size of spilled tensors = 196608 bytes/partition +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: 16384 bytes/partition (100%) successfully pinned +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: pinning saved approximately 9010 cycles +2025-08-07T13:54:04Z INFO 49414 (sg01) [SB_Allocator]: 0% SB utilization after allocation +2025-08-07T13:54:04Z INFO 49414 (sg02) [PreSched]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:04Z INFO 49414 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 380453376 +2025-08-07T13:54:04Z INFO 49414 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 1167 bytes +2025-08-07T13:54:04Z INFO 49414 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 77594626 +2025-08-07T13:54:04Z INFO 49414 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2104 bytes +2025-08-07T13:54:04Z INFO 49414 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 2129920 +2025-08-07T13:54:04Z INFO 49414 (sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: coloring_allocator_sb finished after 0.249 seconds +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 613mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5304 memory location(s), 1 block(s), and 20745 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=5304 blocks=1 instructions=20745 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: address_rotation_sb finished after 0.014 seconds +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 613mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5304 memory location(s), 1 block(s), and 20745 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: Running dma_optimization_sb +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=5304 blocks=1 instructions=20745 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 458048002, 57.2487% input load, 1.83138% output write, 40.9199% spill/reload [sg0001] +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: removed 0 identical load +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: sub-graph will get execute 35 times +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(2.62226e+08) +2025-08-07T13:54:04Z INFO 49414 (sg02) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:04Z INFO 49414 (sg02) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 356 spill/reload instructions +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 312 spill/reload memory locations +2025-08-07T13:54:04Z INFO 49414 (sg02) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:04Z INFO 49414 (sg02) [PreSched]: End DCE Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: [spill optimization round 1]: removed 40 spill/reload instructions +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: [spill optimization round 1]: removed 34 spill/reload memory locations +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: [spill optimization round 2]: removed 0 spill/reload instructions +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: [spill optimization round 2]: removed 0 spill/reload memory locations +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 93585408, 49.9301% out of total spill/reload dma traffic +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:04Z INFO 49414 (sg02) [PreSched]: Start build flow dependencies Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg02) [build_flow_deps]: Start build fdeps. Invocation: 5Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-08-07T13:54:04Z INFO 49414 (sg02) [build_flow_deps]: Allocs: 11794 instructions: 60305 +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 228 SpillSaves and Reloads +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: average loaded DMA size 1133 bytes +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: average saved DMA size 2302 bytes +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 1 combined 88 SpillSaves and Reloads +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: average loaded DMA size 1144 bytes +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: average saved DMA size 2621 bytes +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 2 combined 0 SpillSaves and Reloads +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: average loaded DMA size 1144 bytes +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: average saved DMA size 2621 bytes +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 311116288 +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 1144 bytes +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 53346306 +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 2621 bytes +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 1441792, 0.769231% out of total spill/reload dma traffic +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 95027200, 20.7461% out of total dma traffic +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 363020802, 72.2345% input load, 2.31078% output write, 25.4547% spill/reload [sg0001] +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 310395392 +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 1145 bytes +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 52625410 +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 2678 bytes +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 2129920 +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 130 bytes +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 1188 bytes +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: dma_optimization_sb finished after 0.210 seconds +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 616mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4722 memory location(s), 1 block(s), and 20193 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=4722 blocks=1 instructions=20193 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: SB Rotation rotated 35 Sb address +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: SB Rotation rotated 214 Sb address +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: SB Rotation rotated 53 Sb address +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: SB Rotation rotated 26 Sb address +2025-08-07T13:54:04Z INFO 49414 (sg02) [build_flow_deps]: Build fdeps inserted 207193 edges +2025-08-07T13:54:04Z INFO 49414 (sg02) [build_flow_deps]: Done build fdeps 207193 Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg02) [PreSched]: End build flow dependencies Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg02) [PreSched]: Start remove useless insts Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg02) [PreSched]: remove_useless_insts +2025-08-07T13:54:04Z INFO 49414 (sg02) [PreSched]: remove Useless Instructions: 0 +2025-08-07T13:54:04Z INFO 49414 (sg02) [PreSched]: End remove useless insts Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg02) [PreSched]: Start scratchpad optimization Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: SB Rotation rotated 372 Sb address +2025-08-07T13:54:04Z INFO 49414 (sg02) [PreSched]: End scratchpad optimization Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: address_rotation_sb finished after 0.100 seconds +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 619mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4722 memory location(s), 1 block(s), and 20193 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: Running coloring_allocator_dram +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=4722 blocks=1 instructions=20193 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z INFO 49414 (sg01) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:04Z INFO 49414 (sg01) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:04Z INFO 49414 (sg01) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-08-07T13:54:04Z INFO 49414 (sg01) [DRAM_Allocator]: reserved space = 232342024 bytes +2025-08-07T13:54:04Z INFO 49414 (sg01) [DRAM_Allocator]: spill space = 67108864 bytes +2025-08-07T13:54:04Z INFO 49414 (sg01) [DRAM_Allocator]: aligned spill space = 67108864 bytes +2025-08-07T13:54:04Z INFO 49414 (sg01) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-08-07T13:54:04Z INFO 49414 (sg01) [DRAM_Allocator]: renumber locations +2025-08-07T13:54:04Z INFO 49414 (sg01) [DRAM_Allocator]: size = 70 +2025-08-07T13:54:04Z INFO 49414 []: find first defs for local +2025-08-07T13:54:04Z INFO 49414 []: find first defs for global +2025-08-07T13:54:04Z INFO 49414 (sg01) [DRAM_Allocator]: Num intervals 70 Num locations 70 +2025-08-07T13:54:04Z INFO 49414 (sg01) [DRAM_Allocator]: IntervalTree Build Done +2025-08-07T13:54:04Z INFO 49414 (sg01) [DRAM_Allocator]: info.neighbors init Done +2025-08-07T13:54:04Z INFO 49414 (sg01) [DRAM_Allocator]: IntervalTree readback Done +2025-08-07T13:54:04Z INFO 49414 (sg01) [DRAM_Allocator]: simplify interference graph +2025-08-07T13:54:04Z INFO 49414 (sg01) [DRAM_Allocator]: initialize low and high +2025-08-07T13:54:04Z INFO 49414 (sg01) [DRAM_Allocator]: lo = 70 +2025-08-07T13:54:04Z INFO 49414 (sg01) [DRAM_Allocator]: hi = 0 +2025-08-07T13:54:04Z INFO 49414 (sg01) [DRAM_Allocator]: total = 70 +2025-08-07T13:54:04Z INFO 49414 (sg01) [DRAM_Allocator]: simplify +2025-08-07T13:54:04Z INFO 49414 (sg01) [DRAM_Allocator]: new candidates = 0 +2025-08-07T13:54:04Z INFO 49414 (sg01) [DRAM_Allocator]: select ranges +2025-08-07T13:54:04Z INFO 49414 (sg01) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-08-07T13:54:04Z INFO 49414 (sg01) [DRAM_Allocator]: allreduce_dram_hwm 33554432 +2025-08-07T13:54:04Z INFO 49414 (sg01) [DRAM_Allocator]: Real CC buffer size 33554432 +2025-08-07T13:54:04Z INFO 49414 (sg01) [DRAM_Allocator]: DRAM hwm after allocation: 55443456 +2025-08-07T13:54:04Z INFO 49414 (sg01) [DRAM_Allocator]: DRAM allocation successful +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: coloring_allocator_dram finished after 0.020 seconds +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 620mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4722 memory location(s), 1 block(s), and 20193 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: Running address_rotation_dram +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=4722 blocks=1 instructions=20193 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: Runtime page size at 512MB +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: DRAM hwm before rotation 55443456 +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: allreduce hwm 33554432 +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: Real CC buffer size 33554432 +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: DRAM hwm after rotation 55443456 +2025-08-07T13:54:04Z INFO 49414 (sg01) [DMAOptimizationBase]: DRAM Rotation rotated 4 Dram address +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: address_rotation_dram finished after 0.008 seconds +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 620mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4722 memory location(s), 1 block(s), and 20193 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: Running tensorcopy_accel +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=4722 blocks=1 instructions=20193 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z INFO 49414 (sg01) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-08-07T13:54:04Z INFO 49414 (sg01) [TensorCopyAccel::Impl]: Accelerated 32 out of 826 tensorcopy in Function: sg0001 average acceleration factor: 1 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: tensorcopy_accel finished after 0.001 seconds +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 620mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4722 memory location(s), 1 block(s), and 20193 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: Running peephole_opts +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=4722 blocks=1 instructions=20193 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z INFO 49414 (sg01) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: peephole_opts finished after 0.008 seconds +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 620mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4722 memory location(s), 1 block(s), and 20449 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: Running lower_kernel +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=4722 blocks=1 instructions=20449 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z INFO 49414 (sg01) [LowerKernel]: Started running LowerKernel +2025-08-07T13:54:04Z INFO 49414 (sg01) [LowerKernel]: Start of kernel lowering pass, number of insts: 20449, number of allocs: 4722 +2025-08-07T13:54:04Z INFO 49414 (sg01) [LowerKernel]: Scan BKs time (s): 0.001711 +2025-08-07T13:54:04Z INFO 49414 (sg01) [LowerKernel]: Lower BKs time (s): 4e-06 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: lower_kernel finished after 0.001 seconds +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 620mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4722 memory location(s), 1 block(s), and 20449 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: Running lower_nki_kernel +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=4722 blocks=1 instructions=20449 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: lower_nki_kernel finished after 0.001 seconds +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 620mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4722 memory location(s), 1 block(s), and 20449 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=4722 blocks=1 instructions=20449 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.002 seconds +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 620mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4722 memory location(s), 1 block(s), and 20449 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=4722 blocks=1 instructions=20449 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: birverifier finished after 0.012 seconds +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 620mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4722 memory location(s), 1 block(s), and 20449 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: Running dynamic_dma_scan +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=4722 blocks=1 instructions=20449 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: dynamic_dma_scan finished after 0.002 seconds +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 620mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4722 memory location(s), 1 block(s), and 20449 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z USER 49414 (sg01) [ModuleForkPass]: Running build_fdeps +2025-08-07T13:54:04Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=4722 blocks=1 instructions=20449 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:04Z INFO 49414 (sg01) [build_flow_deps]: Start build fdeps. Invocation: 6Thu Aug 7 13:54:04 2025 +2025-08-07T13:54:04Z INFO 49414 (sg01) [build_flow_deps]: Allocs: 4722 instructions: 20449 +2025-08-07T13:54:05Z INFO 49414 (sg02) [PreSched]: DONE PRE scheduling Thu Aug 7 13:54:05 2025 +2025-08-07T13:54:05Z USER 49414 (sg02) [ModuleForkPass]: pre_sched finished after 0.556 seconds +2025-08-07T13:54:05Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 620mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:05Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11794 memory location(s), 1 block(s), and 60305 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:05Z USER 49414 (sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:54:05Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=11794 blocks=1 instructions=60305 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:05Z INFO 49414 (sg02) [TensorCopyElim]: Tensor CP elimination: 1 +2025-08-07T13:54:05Z INFO 49414 (sg01) [build_flow_deps]: Build fdeps inserted 62004 edges +2025-08-07T13:54:05Z INFO 49414 (sg01) [build_flow_deps]: Done build fdeps 62004 Thu Aug 7 13:54:05 2025 +2025-08-07T13:54:05Z USER 49414 (sg01) [ModuleForkPass]: build_fdeps finished after 0.051 seconds +2025-08-07T13:54:05Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 620mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:05Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4722 memory location(s), 1 block(s), and 20449 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:05Z USER 49414 (sg01) [ModuleForkPass]: Running remove_redundancies +2025-08-07T13:54:05Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=4722 blocks=1 instructions=20449 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:05Z INFO 49414 (sg01) [RemoveRedundancies]: remove_clobbered_writes +2025-08-07T13:54:05Z INFO 49414 (sg01) [RemoveRedundancies]: remove_clobbered_writes: 11 +2025-08-07T13:54:05Z INFO 49414 (sg01) [RemoveRedundancies]: remove_useless_insts +2025-08-07T13:54:05Z INFO 49414 (sg01) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-08-07T13:54:05Z USER 49414 (sg01) [ModuleForkPass]: remove_redundancies finished after 0.008 seconds +2025-08-07T13:54:05Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 620mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:05Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4722 memory location(s), 1 block(s), and 20438 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:05Z USER 49414 (sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:05Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=4722 blocks=1 instructions=20438 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:05Z INFO 49414 (sg01) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:05Z INFO 49414 (sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:54:05Z INFO 49414 (sg01) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:05Z INFO 49414 (sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:05Z INFO 49414 (sg02) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:05Z INFO 49414 (sg02) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:54:05Z INFO 49414 (sg02) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:54:05Z USER 49414 (sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.126 seconds +2025-08-07T13:54:05Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 620mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:05Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11793 memory location(s), 1 block(s), and 60304 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:05Z USER 49414 (sg02) [ModuleForkPass]: Running dynamic_dma_setup +2025-08-07T13:54:05Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=11793 blocks=1 instructions=60304 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:05Z USER 49414 (sg02) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-08-07T13:54:05Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 620mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:05Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11794 memory location(s), 1 block(s), and 60304 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:05Z USER 49414 (sg02) [ModuleForkPass]: Running runtime_memory_reservation +2025-08-07T13:54:05Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=11794 blocks=1 instructions=60304 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:05Z USER 49414 (sg02) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-08-07T13:54:05Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 620mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:05Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11794 memory location(s), 1 block(s), and 60304 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:05Z USER 49414 (sg02) [ModuleForkPass]: Running coloring_allocator_psum +2025-08-07T13:54:05Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=11794 blocks=1 instructions=60304 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:05Z INFO 49414 (sg02) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:05Z INFO 49414 (sg02) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:05Z INFO 49414 (sg02) [PSUM_Allocator]: allocating PSUM +2025-08-07T13:54:05Z INFO 49414 (sg02) [PSUM_Allocator]: main loop +2025-08-07T13:54:05Z INFO 49414 (sg02) [PSUM_Allocator]: renumber locations +2025-08-07T13:54:05Z INFO 49414 (sg02) [PSUM_Allocator]: size = 6399 +2025-08-07T13:54:05Z INFO 49414 (sg02) [PSUM_Allocator]: build_no_bitmap start +2025-08-07T13:54:05Z USER 49414 (sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.147 seconds +2025-08-07T13:54:05Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 623mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:05Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4722 memory location(s), 1 block(s), and 20438 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:05Z USER 49414 (sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:54:05Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=4722 blocks=1 instructions=20438 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:05Z INFO 49414 (sg02) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-08-07T13:54:05Z INFO 49414 (sg02) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-08-07T13:54:05Z INFO 49414 (sg02) [PSUM_Allocator]: found 17772 edges +2025-08-07T13:54:05Z INFO 49414 (sg02) [PSUM_Allocator]: mean: 5.55462 +2025-08-07T13:54:05Z INFO 49414 (sg02) [PSUM_Allocator]: median: 6.99908 +2025-08-07T13:54:05Z INFO 49414 (sg02) [PSUM_Allocator]: adjacency vectors require 142176 bytes +2025-08-07T13:54:05Z INFO 49414 (sg02) [PSUM_Allocator]: build_no_bitmap done +2025-08-07T13:54:05Z INFO 49414 (sg02) [PSUM_Allocator]: find costs +2025-08-07T13:54:05Z INFO 49414 (sg01) [TensorCopyElim]: Tensor CP elimination: 0 +2025-08-07T13:54:05Z INFO 49414 (sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:05Z USER 49414 (sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.048 seconds +2025-08-07T13:54:05Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 623mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:05Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4722 memory location(s), 1 block(s), and 20438 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:05Z USER 49414 (sg01) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-08-07T13:54:05Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=4722 blocks=1 instructions=20438 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:05Z USER 49414 (sg01) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-08-07T13:54:05Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 623mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:05Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4722 memory location(s), 1 block(s), and 20438 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:05Z USER 49414 (sg01) [ModuleForkPass]: Running post_sched +2025-08-07T13:54:05Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=4722 blocks=1 instructions=20438 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:05Z INFO 49414 [post_scheduler]: Start PosT ScheD 3 sunda Thu Aug 7 13:54:05 2025 +2025-08-07T13:54:05Z INFO 49414 (sg02) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-08-07T13:54:05Z INFO 49414 (sg02) [PSUM_Allocator]: simplify interference graph +2025-08-07T13:54:05Z INFO 49414 (sg02) [PSUM_Allocator]: initialize low and high +2025-08-07T13:54:05Z INFO 49414 (sg02) [PSUM_Allocator]: lo = 6397 +2025-08-07T13:54:05Z INFO 49414 (sg02) [PSUM_Allocator]: hi = 2 +2025-08-07T13:54:05Z INFO 49414 (sg02) [PSUM_Allocator]: inf = 0 +2025-08-07T13:54:05Z INFO 49414 (sg02) [PSUM_Allocator]: total = 6399 +2025-08-07T13:54:05Z INFO 49414 (sg02) [PSUM_Allocator]: simplify +2025-08-07T13:54:05Z INFO 49414 (sg02) [PSUM_Allocator]: new candidates = 0 +2025-08-07T13:54:05Z INFO 49414 (sg02) [PSUM_Allocator]: select ranges +2025-08-07T13:54:05Z INFO 49414 (sg02) [PSUM_Allocator]: no more spills +2025-08-07T13:54:05Z INFO 49414 (sg02) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-08-07T13:54:05Z INFO 49414 (sg02) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-08-07T13:54:05Z INFO 49414 (sg02) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-08-07T13:54:05Z USER 49414 (sg02) [ModuleForkPass]: coloring_allocator_psum finished after 0.211 seconds +2025-08-07T13:54:05Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 623mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:05Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11794 memory location(s), 1 block(s), and 60304 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:05Z USER 49414 (sg02) [ModuleForkPass]: Running dma_optimization_psum +2025-08-07T13:54:05Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=11794 blocks=1 instructions=60304 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:05Z INFO 49414 (sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-08-07T13:54:05Z INFO 49414 (sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-08-07T13:54:05Z USER 49414 (sg02) [ModuleForkPass]: dma_optimization_psum finished after 0.038 seconds +2025-08-07T13:54:05Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 623mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:05Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11794 memory location(s), 1 block(s), and 60304 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:05Z USER 49414 (sg02) [ModuleForkPass]: Running address_rotation_psum +2025-08-07T13:54:05Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=11794 blocks=1 instructions=60304 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:05Z INFO 49414 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2025-08-07T13:54:05Z INFO 49414 [post_scheduler]: Time-aware hwm post-sched +2025-08-07T13:54:05Z INFO 49414 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 1 PSUM Banks +2025-08-07T13:54:05Z INFO 49414 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2025-08-07T13:54:05Z USER 49414 (sg02) [ModuleForkPass]: address_rotation_psum finished after 0.235 seconds +2025-08-07T13:54:05Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 631mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:05Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11794 memory location(s), 1 block(s), and 60304 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:05Z USER 49414 (sg02) [ModuleForkPass]: Running coloring_allocator_sb +2025-08-07T13:54:05Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=11794 blocks=1 instructions=60304 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:05Z INFO 49414 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 840812318 +2025-08-07T13:54:05Z INFO 49414 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 2154 bytes +2025-08-07T13:54:05Z INFO 49414 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 17094410 +2025-08-07T13:54:05Z INFO 49414 (sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 2439 bytes +2025-08-07T13:54:05Z INFO 49414 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 8196 +2025-08-07T13:54:05Z INFO 49414 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 248 bytes +2025-08-07T13:54:05Z INFO 49414 (sg02) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:05Z INFO 49414 (sg02) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: allocating SB +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: main loop +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: renumber locations +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: size = 5357 +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: find partners +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: found 6393 accumulation groups +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: largest = _dot.256-t864_i7 +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: tensors = 96 +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: requires 98304 bytes/partition +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: expanding partners +2025-08-07T13:54:05Z INFO 49414 []: find first defs for local +2025-08-07T13:54:05Z INFO 49414 [post_scheduler]: Time-aware simulation time: 165729830 +2025-08-07T13:54:05Z INFO 49414 []: find first defs for global +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: find loads +2025-08-07T13:54:05Z INFO 49414 [post_scheduler]: Done PosT ScheD Thu Aug 7 13:54:05 2025 +2025-08-07T13:54:05Z USER 49414 (sg01) [ModuleForkPass]: post_sched finished after 0.504 seconds +2025-08-07T13:54:05Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 635mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:05Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4722 memory location(s), 1 block(s), and 20438 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:05Z USER 49414 (sg01) [ModuleForkPass]: Running expand_scheduling_units +2025-08-07T13:54:05Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=4722 blocks=1 instructions=20438 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:05Z USER 49414 (sg01) [ModuleForkPass]: expand_scheduling_units finished after 0.002 seconds +2025-08-07T13:54:05Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 635mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:05Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4722 memory location(s), 1 block(s), and 20438 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:05Z USER 49414 (sg01) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: 1 pin count +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: 1525 remat count +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: build interference graph +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: pass 1 int-tree +2025-08-07T13:54:05Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=4722 blocks=1 instructions=20438 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: Num intervals 5357 Num locations 5357 +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: IntervalTree Build Done +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: info.neighbors init Done +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: info.neighbors partners Done +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: IntervalTree readback Done +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: edge: 191190 +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: mean: 71.3795 +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: median: 57.9004 +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: find costs +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: simplify interference graph +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: initialize safe & unsafe +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: safe = 3897 +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: unsafe = 652 +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: inf = 807 +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: total = 5356 +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: simplify +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 630 #Pinned 0 #Safe 0 minCost 0.00172952 maxCost 0.723851 locations 5357 +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: new candidates = 324 +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: select ranges +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: Total: 5356 +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: Spilled: 0.020 (105) +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: Allocated: 0.980 (5251) +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: Rover zone: 0.865 (4540) +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: Pre-rover zone: 0.004 (22) +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: Post-rover zone: 0.130 (685) +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: Slice zone: 0.001 (4) +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: Blocks nothing: 0.039 (205) +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: Blocks medium: 0.002 (11) +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: Visited until medium blocking (mean): 0.645 +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: Visited until medium blocking (median): 0.711 +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: Visited until medium blocking (p95): 0.731 +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: Blocks tall: 0.959 (5035) +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: Visited until tall blocking (mean): 0.862 +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: Visited until tall blocking (median): 0.993 +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: Success +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: SB spills = 105 tensors +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: size = 155656 bytes/partition +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: remats = 0 tensors +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: unpinned = 0 tensors +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: SB score = 974301 +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: best SB heuristic = 0 +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: collect spills +2025-08-07T13:54:05Z INFO 49414 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 524 PSUM Banks +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: insert spills +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: deleting loads #loadsToDelete: 0 +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: deleting locs #locationsToDelete: 0 +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: locationsToDelete done +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: main loop +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: renumber locations +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: size = 5639 +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: find partners +2025-08-07T13:54:05Z INFO 49414 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 514 PSUM Banks +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: found 6393 accumulation groups +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: largest = _dot.256-t864_i7 +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: tensors = 96 +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: requires 98304 bytes/partition +2025-08-07T13:54:05Z INFO 49414 (sg02) [SB_Allocator]: expanding partners +2025-08-07T13:54:05Z INFO 49414 (sg01) [DMAOptimizationBase]: PSUM Rotation rotated 17 PSUM Banks +2025-08-07T13:54:05Z INFO 49414 (sg01) [DMAOptimizationBase]: SB Rotation rotated 12 Sb address +2025-08-07T13:54:05Z INFO 49414 []: find first defs for local +2025-08-07T13:54:06Z INFO 49414 []: find first defs for global +2025-08-07T13:54:06Z INFO 49414 (sg01) [DMAOptimizationBase]: SB Rotation rotated 233 Sb address +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: find loads +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: 1 pin count +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: 1759 remat count +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: build interference graph +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: pass 1 int-tree +2025-08-07T13:54:06Z INFO 49414 (sg01) [DMAOptimizationBase]: SB Rotation rotated 30 Sb address +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: Num intervals 5639 Num locations 5639 +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: IntervalTree Build Done +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: info.neighbors init Done +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: info.neighbors partners Done +2025-08-07T13:54:06Z INFO 49414 (sg01) [DMAOptimizationBase]: SB Rotation rotated 38 Sb address +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: IntervalTree readback Done +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: edge: 157145 +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: mean: 55.7351 +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: median: 51.4479 +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: find costs +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: simplify interference graph +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: initialize safe & unsafe +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: safe = 111 +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: unsafe = 46 +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: inf = 230 +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: total = 387 +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: simplify +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 44 #Pinned 0 #Safe 0 minCost 0.00408226 maxCost 0.0326828 locations 5639 +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: new candidates = 43 +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: (including 230 infinite cost tensors) +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: select ranges +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: Total: 387 +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: Spilled: 0.000 (0) +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: Allocated: 1.000 (387) +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: Rover zone: 0.488 (189) +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: Pre-rover zone: 0.008 (3) +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: Post-rover zone: 0.504 (195) +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: Slice zone: 0.000 (0) +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: Blocks tall: 1.000 (387) +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-08-07T13:54:06Z INFO 49414 (sg02) [SB_Allocator]: Success +2025-08-07T13:54:06Z INFO 49414 (sg01) [DMAOptimizationBase]: SB Rotation rotated 302 Sb address +2025-08-07T13:54:06Z INFO 49414 (sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:06Z INFO 49414 (sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:06Z USER 49414 (sg01) [ModuleForkPass]: address_rotation_sb finished after 0.378 seconds +2025-08-07T13:54:06Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 640mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:06Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4722 memory location(s), 1 block(s), and 20438 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:06Z USER 49414 (sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:06Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=4722 blocks=1 instructions=20438 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:06Z INFO 49414 (sg01) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:06Z INFO 49414 (sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:54:06Z INFO 49414 (sg01) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:06Z USER 49414 (sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.098 seconds +2025-08-07T13:54:06Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 644mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:06Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4722 memory location(s), 1 block(s), and 20438 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:06Z USER 49414 (sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:06Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=4722 blocks=1 instructions=20438 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:06Z INFO 49414 (sg01) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:06Z INFO 49414 (sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-08-07T13:54:06Z INFO 49414 (sg01) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:06Z USER 49414 (sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.013 seconds +2025-08-07T13:54:06Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 644mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:06Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4722 memory location(s), 1 block(s), and 20438 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:06Z USER 49414 (sg01) [ModuleForkPass]: Running dep_opt +2025-08-07T13:54:06Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=4722 blocks=1 instructions=20438 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:06Z INFO 49414 (sg01) [build_flow_deps]: Start build fdeps. Invocation: 7Thu Aug 7 13:54:06 2025 +2025-08-07T13:54:06Z INFO 49414 (sg01) [build_flow_deps]: Allocs: 4722 instructions: 20438 +2025-08-07T13:54:06Z INFO 49414 (sg01) [build_flow_deps]: Build fdeps inserted 61513 edges +2025-08-07T13:54:06Z INFO 49414 (sg01) [build_flow_deps]: Done build fdeps 61513 Thu Aug 7 13:54:06 2025 +2025-08-07T13:54:06Z USER 49414 (sg01) [ModuleForkPass]: dep_opt finished after 0.063 seconds +2025-08-07T13:54:06Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 644mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:06Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4722 memory location(s), 1 block(s), and 20438 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:06Z USER 49414 (sg01) [ModuleForkPass]: Running report_stats +2025-08-07T13:54:06Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=4722 blocks=1 instructions=20438 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:06Z INFO 49414 (sg01) [ReportStats]: Data Movement Statistics: sg0001 +┌──────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 25165824 │ +│ DMACopy │ Internal -> ExternalOutput │ 64 │ 67108864 │ +│ DMACopy │ Internal -> Output │ 1 │ 16777216 │ +│ Load │ Const -> Internal │ 2 │ 65536 │ +│ Load │ ExternalInput -> Internal │ 1972 │ 260063744 │ +│ Load │ Input -> Internal │ 6 │ 2097152 │ +│ Load │ Internal │ 132 │ 47448064 │ +│ Save │ Internal │ 99 │ 31195136 │ +│ Save │ Internal -> Output │ 17 │ 8388610 │ +│ Save (Spill) │ Internal │ 44 │ 13041664 │ +└──────────────┴────────────────────────────┴───────┴───────────┘ + +2025-08-07T13:54:06Z INFO 49414 (sg01) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 3 │ +│ 64 │ 2 │ +│ 256 │ 1538 │ +│ 1024 │ 73 │ +│ 2048 │ 156 │ +│ 4096 │ 500 │ +│ 262144 │ 64 │ +│ 8388608 │ 5 │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:06Z INFO 49414 (sg01) [ReportStats]: MM Stats: #MatMults 14360 #MatMult-Transposes 1904 +2025-08-07T13:54:06Z INFO 49414 (sg01) [ReportStats]: IO Tensor size combined: 197149188 +2025-08-07T13:54:06Z INFO 49414 (sg01) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input87 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input84 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input85 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input88 │ ExternalInput │ bfloat16 │ 16777216 │ +│ input94 │ ExternalInput │ bfloat16 │ 16777216 │ +│ input92 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input89 │ ExternalInput │ bfloat16 │ 4194304 │ +│ output4 │ ExternalOutput │ bfloat16 │ 1048576 │ +│ input7 │ ExternalInput │ bfloat16 │ 1048576 │ +│ input6 │ ExternalInput │ bfloat16 │ 1048576 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-08-07T13:54:06Z INFO 49414 (sg01) [ReportStats]: Large (Internal) Tensor Statistics: +┌───────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├───────────────────────────┼──────────┼──────────┼──────────────┤ +│ intermediate1 │ Input │ bfloat16 │ 8388608 │ +│ add.4 │ Internal │ bfloat16 │ 8388608 │ +│ dot.7-buffer-1826 │ Internal │ bfloat16 │ 8388608 │ +│ intermediate6 │ Output │ bfloat16 │ 8388608 │ +│ intermediate4 │ Input │ bfloat16 │ 8388608 │ +│ intermediate7 │ Output │ bfloat16 │ 8388608 │ +│ all_reduce.1-buffer-1828 │ Internal │ bfloat16 │ 8388608 │ +│ intermediate7-buffer-1833 │ Internal │ bfloat16 │ 8388608 │ +│ dot.11-buffer-1831 │ Internal │ bfloat16 │ 8388608 │ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +└───────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-08-07T13:54:06Z USER 49414 (sg01) [ModuleForkPass]: report_stats finished after 0.004 seconds +2025-08-07T13:54:06Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 644mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:06Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4722 memory location(s), 1 block(s), and 20438 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:30Z INFO 49414 (sg02) [SB_Allocator]: SB spills = 0 tensors +2025-08-07T13:54:30Z INFO 49414 (sg02) [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:54:30Z INFO 49414 (sg02) [SB_Allocator]: remats = 0 tensors +2025-08-07T13:54:30Z INFO 49414 (sg02) [SB_Allocator]: unpinned = 0 tensors +2025-08-07T13:54:30Z INFO 49414 (sg02) [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:54:30Z INFO 49414 (sg02) [SB_Allocator]: SB score = 0 +2025-08-07T13:54:30Z INFO 49414 (sg02) [SB_Allocator]: spilling from SB cost about 974301 cycles +2025-08-07T13:54:30Z INFO 49414 (sg02) [SB_Allocator]: number of tensors spilled from SB = 105 +2025-08-07T13:54:30Z INFO 49414 (sg02) [SB_Allocator]: total size of spilled tensors = 155656 bytes/partition +2025-08-07T13:54:30Z INFO 49414 (sg02) [SB_Allocator]: 16384 bytes/partition (100%) successfully pinned +2025-08-07T13:54:30Z INFO 49414 (sg02) [SB_Allocator]: pinning saved approximately 9010 cycles +2025-08-07T13:54:30Z INFO 49414 (sg02) [SB_Allocator]: 0% SB utilization after allocation +2025-08-07T13:54:30Z INFO 49414 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 907008030 +2025-08-07T13:54:30Z INFO 49414 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 2127 bytes +2025-08-07T13:54:30Z INFO 49414 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 62183434 +2025-08-07T13:54:30Z INFO 49414 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2338 bytes +2025-08-07T13:54:30Z INFO 49414 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 8196 +2025-08-07T13:54:30Z INFO 49414 (sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 248 bytes +2025-08-07T13:54:30Z USER 49414 (sg02) [ModuleForkPass]: coloring_allocator_sb finished after 25.106 seconds +2025-08-07T13:54:30Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 644mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:30Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12181 memory location(s), 1 block(s), and 60739 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:30Z USER 49414 (sg02) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:30Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=12181 blocks=1 instructions=60739 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:30Z INFO 49414 (sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:30Z USER 49414 (sg02) [ModuleForkPass]: address_rotation_sb finished after 0.081 seconds +2025-08-07T13:54:30Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 644mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:30Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12181 memory location(s), 1 block(s), and 60739 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:30Z USER 49414 (sg02) [ModuleForkPass]: Running dma_optimization_sb +2025-08-07T13:54:30Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=12181 blocks=1 instructions=60739 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:30Z INFO 49414 (sg02) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 969191464, 84.99% input load, 4.12715e-07% output write, 15.01% spill/reload [sg0002] +2025-08-07T13:54:30Z INFO 49414 (sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:54:30Z INFO 49414 (sg02) [DMAOptimizationBase]: removed 0 identical load +2025-08-07T13:54:30Z INFO 49414 (sg02) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:54:30Z INFO 49414 (sg02) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:54:30Z INFO 49414 (sg02) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-08-07T13:54:30Z INFO 49414 (sg02) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-08-07T13:54:30Z INFO 49414 (sg02) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-08-07T13:54:30Z INFO 49414 (sg02) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 4096, 0.00042262% out of total dma traffic(8.23716e+08) +2025-08-07T13:54:30Z INFO 49414 (sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 164 spill/reload instructions +2025-08-07T13:54:30Z INFO 49414 (sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 127 spill/reload memory locations +2025-08-07T13:54:30Z INFO 49414 (sg02) [DMAOptimizationBase]: [spill optimization round 1]: removed 15 spill/reload instructions +2025-08-07T13:54:30Z INFO 49414 (sg02) [DMAOptimizationBase]: [spill optimization round 1]: removed 8 spill/reload memory locations +2025-08-07T13:54:30Z INFO 49414 (sg02) [DMAOptimizationBase]: [spill optimization round 2]: removed 0 spill/reload instructions +2025-08-07T13:54:30Z INFO 49414 (sg02) [DMAOptimizationBase]: [spill optimization round 2]: removed 0 spill/reload memory locations +2025-08-07T13:54:30Z INFO 49414 (sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 62526724, 42.9809% out of total spill/reload dma traffic +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 13 spill/reload instructions +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 1 spill/reload memory locations +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 176 SpillSaves and Reloads +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: average loaded DMA size 2165 bytes +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: average saved DMA size 2573 bytes +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 1 combined 88 SpillSaves and Reloads +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: average loaded DMA size 2186 bytes +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: average saved DMA size 3159 bytes +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 2 combined 0 SpillSaves and Reloads +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: average loaded DMA size 2186 bytes +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: average saved DMA size 3159 bytes +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 867544860 +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 2186 bytes +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 39112456 +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 3159 bytes +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 3328, 0.00228767% out of total spill/reload dma traffic +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 62534148, 6.4522% out of total dma traffic +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 906657316, 90.8515% input load, 4.41181e-07% output write, 9.1485% spill/reload [sg0002] +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 867544860 +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 2186 bytes +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 39112456 +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 3159 bytes +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 8196 +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 248 bytes +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 2215 bytes +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-08-07T13:54:31Z USER 49414 (sg02) [ModuleForkPass]: dma_optimization_sb finished after 0.519 seconds +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 648mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11821 memory location(s), 1 block(s), and 60402 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:31Z USER 49414 (sg02) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=11821 blocks=1 instructions=60402 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: SB Rotation rotated 320 Sb address +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: SB Rotation rotated 783 Sb address +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: SB Rotation rotated 229 Sb address +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: SB Rotation rotated 12 Sb address +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: SB Rotation rotated 243 Sb address +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:31Z USER 49414 (sg02) [ModuleForkPass]: address_rotation_sb finished after 0.344 seconds +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 648mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11821 memory location(s), 1 block(s), and 60402 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:31Z USER 49414 (sg02) [ModuleForkPass]: Running coloring_allocator_dram +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=11821 blocks=1 instructions=60402 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:31Z INFO 49414 (sg02) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:31Z INFO 49414 (sg02) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:31Z INFO 49414 (sg02) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-08-07T13:54:31Z INFO 49414 (sg02) [DRAM_Allocator]: reserved space = 790157338 bytes +2025-08-07T13:54:31Z INFO 49414 (sg02) [DRAM_Allocator]: spill space = 53796612 bytes +2025-08-07T13:54:31Z INFO 49414 (sg02) [DRAM_Allocator]: aligned spill space = 53837824 bytes +2025-08-07T13:54:31Z INFO 49414 (sg02) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-08-07T13:54:31Z INFO 49414 (sg02) [DRAM_Allocator]: renumber locations +2025-08-07T13:54:31Z INFO 49414 (sg02) [DRAM_Allocator]: size = 56 +2025-08-07T13:54:31Z INFO 49414 []: find first defs for local +2025-08-07T13:54:31Z INFO 49414 []: find first defs for global +2025-08-07T13:54:31Z INFO 49414 (sg02) [DRAM_Allocator]: Num intervals 56 Num locations 56 +2025-08-07T13:54:31Z INFO 49414 (sg02) [DRAM_Allocator]: IntervalTree Build Done +2025-08-07T13:54:31Z INFO 49414 (sg02) [DRAM_Allocator]: info.neighbors init Done +2025-08-07T13:54:31Z INFO 49414 (sg02) [DRAM_Allocator]: IntervalTree readback Done +2025-08-07T13:54:31Z INFO 49414 (sg02) [DRAM_Allocator]: simplify interference graph +2025-08-07T13:54:31Z INFO 49414 (sg02) [DRAM_Allocator]: initialize low and high +2025-08-07T13:54:31Z INFO 49414 (sg02) [DRAM_Allocator]: lo = 56 +2025-08-07T13:54:31Z INFO 49414 (sg02) [DRAM_Allocator]: hi = 0 +2025-08-07T13:54:31Z INFO 49414 (sg02) [DRAM_Allocator]: total = 56 +2025-08-07T13:54:31Z INFO 49414 (sg02) [DRAM_Allocator]: simplify +2025-08-07T13:54:31Z INFO 49414 (sg02) [DRAM_Allocator]: new candidates = 0 +2025-08-07T13:54:31Z INFO 49414 (sg02) [DRAM_Allocator]: select ranges +2025-08-07T13:54:31Z INFO 49414 (sg02) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-08-07T13:54:31Z INFO 49414 (sg02) [DRAM_Allocator]: allreduce_dram_hwm 16793600 +2025-08-07T13:54:31Z INFO 49414 (sg02) [DRAM_Allocator]: Real CC buffer size 16793600 +2025-08-07T13:54:31Z INFO 49414 (sg02) [DRAM_Allocator]: DRAM hwm after allocation: 38813696 +2025-08-07T13:54:31Z INFO 49414 (sg02) [DRAM_Allocator]: DRAM allocation successful +2025-08-07T13:54:31Z USER 49414 (sg02) [ModuleForkPass]: coloring_allocator_dram finished after 0.087 seconds +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 648mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11821 memory location(s), 1 block(s), and 60402 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:31Z USER 49414 (sg02) [ModuleForkPass]: Running address_rotation_dram +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=11821 blocks=1 instructions=60402 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: Runtime page size at 512MB +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: DRAM hwm before rotation 38813696 +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: allreduce hwm 16793600 +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: Real CC buffer size 16793600 +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: DRAM hwm after rotation 38813696 +2025-08-07T13:54:31Z INFO 49414 (sg02) [DMAOptimizationBase]: DRAM Rotation rotated 10 Dram address +2025-08-07T13:54:31Z USER 49414 (sg02) [ModuleForkPass]: address_rotation_dram finished after 0.037 seconds +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 648mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11821 memory location(s), 1 block(s), and 60402 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:31Z USER 49414 (sg02) [ModuleForkPass]: Running tensorcopy_accel +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=11821 blocks=1 instructions=60402 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:31Z INFO 49414 (sg02) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-08-07T13:54:31Z INFO 49414 (sg02) [TensorCopyAccel::Impl]: Accelerated 0 out of 6323 tensorcopy in Function: sg0002 average acceleration factor: -nan +2025-08-07T13:54:31Z USER 49414 (sg02) [ModuleForkPass]: tensorcopy_accel finished after 0.005 seconds +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 648mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11821 memory location(s), 1 block(s), and 60402 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:31Z USER 49414 (sg02) [ModuleForkPass]: Running peephole_opts +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=11821 blocks=1 instructions=60402 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:31Z INFO 49414 (sg02) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-08-07T13:54:31Z USER 49414 (sg02) [ModuleForkPass]: peephole_opts finished after 0.016 seconds +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 648mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11821 memory location(s), 1 block(s), and 60406 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:31Z USER 49414 (sg02) [ModuleForkPass]: Running lower_kernel +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=11821 blocks=1 instructions=60406 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:31Z INFO 49414 (sg02) [LowerKernel]: Started running LowerKernel +2025-08-07T13:54:31Z INFO 49414 (sg02) [LowerKernel]: Start of kernel lowering pass, number of insts: 60406, number of allocs: 11821 +2025-08-07T13:54:31Z INFO 49414 (sg02) [LowerKernel]: Scan BKs time (s): 0.003405 +2025-08-07T13:54:31Z INFO 49414 (sg02) [LowerKernel]: Lower BKs time (s): 6e-06 +2025-08-07T13:54:31Z USER 49414 (sg02) [ModuleForkPass]: lower_kernel finished after 0.004 seconds +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 648mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11821 memory location(s), 1 block(s), and 60406 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:31Z USER 49414 (sg02) [ModuleForkPass]: Running lower_nki_kernel +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=11821 blocks=1 instructions=60406 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:31Z USER 49414 (sg02) [ModuleForkPass]: lower_nki_kernel finished after 0.004 seconds +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 648mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11821 memory location(s), 1 block(s), and 60406 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:31Z USER 49414 (sg02) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=11821 blocks=1 instructions=60406 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:31Z USER 49414 (sg02) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.007 seconds +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 648mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11821 memory location(s), 1 block(s), and 60406 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:31Z USER 49414 (sg02) [ModuleForkPass]: Running birverifier +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=11821 blocks=1 instructions=60406 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:31Z USER 49414 (sg02) [ModuleForkPass]: birverifier finished after 0.042 seconds +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 648mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11821 memory location(s), 1 block(s), and 60406 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:31Z USER 49414 (sg02) [ModuleForkPass]: Running dynamic_dma_scan +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=11821 blocks=1 instructions=60406 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:31Z USER 49414 (sg02) [ModuleForkPass]: dynamic_dma_scan finished after 0.007 seconds +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 648mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11821 memory location(s), 1 block(s), and 60406 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:31Z USER 49414 (sg02) [ModuleForkPass]: Running build_fdeps +2025-08-07T13:54:31Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=11821 blocks=1 instructions=60406 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:31Z INFO 49414 (sg02) [build_flow_deps]: Start build fdeps. Invocation: 8Thu Aug 7 13:54:31 2025 +2025-08-07T13:54:31Z INFO 49414 (sg02) [build_flow_deps]: Allocs: 11821 instructions: 60406 +2025-08-07T13:54:32Z INFO 49414 (sg02) [build_flow_deps]: Build fdeps inserted 207169 edges +2025-08-07T13:54:32Z INFO 49414 (sg02) [build_flow_deps]: Done build fdeps 207169 Thu Aug 7 13:54:32 2025 +2025-08-07T13:54:32Z USER 49414 (sg02) [ModuleForkPass]: build_fdeps finished after 0.145 seconds +2025-08-07T13:54:32Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 648mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:32Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11821 memory location(s), 1 block(s), and 60406 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:32Z USER 49414 (sg02) [ModuleForkPass]: Running remove_redundancies +2025-08-07T13:54:32Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=11821 blocks=1 instructions=60406 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:32Z INFO 49414 (sg02) [RemoveRedundancies]: remove_clobbered_writes +2025-08-07T13:54:32Z INFO 49414 (sg02) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-08-07T13:54:32Z INFO 49414 (sg02) [RemoveRedundancies]: remove_useless_insts +2025-08-07T13:54:32Z INFO 49414 (sg02) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-08-07T13:54:32Z USER 49414 (sg02) [ModuleForkPass]: remove_redundancies finished after 0.021 seconds +2025-08-07T13:54:32Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 648mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:32Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11821 memory location(s), 1 block(s), and 60406 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:32Z USER 49414 (sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:32Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=11821 blocks=1 instructions=60406 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:32Z INFO 49414 (sg02) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:32Z INFO 49414 (sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:54:32Z INFO 49414 (sg02) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:32Z USER 49414 (sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.297 seconds +2025-08-07T13:54:32Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 683mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:32Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11821 memory location(s), 1 block(s), and 60406 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:32Z USER 49414 (sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:54:32Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=11821 blocks=1 instructions=60406 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:32Z INFO 49414 (sg02) [TensorCopyElim]: Tensor CP elimination: 0 +2025-08-07T13:54:32Z INFO 49414 (sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:32Z USER 49414 (sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.076 seconds +2025-08-07T13:54:32Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 662mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:32Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11821 memory location(s), 1 block(s), and 60406 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:32Z USER 49414 (sg02) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-08-07T13:54:32Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=11821 blocks=1 instructions=60406 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:32Z USER 49414 (sg02) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-08-07T13:54:32Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 662mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:32Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11821 memory location(s), 1 block(s), and 60406 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:32Z USER 49414 (sg02) [ModuleForkPass]: Running post_sched +2025-08-07T13:54:32Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=11821 blocks=1 instructions=60406 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:32Z INFO 49414 [post_scheduler]: Start PosT ScheD 3 sunda Thu Aug 7 13:54:32 2025 +2025-08-07T13:54:33Z INFO 49414 [post_scheduler]: Time-aware hwm post-sched +2025-08-07T13:54:33Z INFO 49414 [post_scheduler]: Time-aware simulation time: 8207905 +2025-08-07T13:54:34Z INFO 49414 [post_scheduler]: Done PosT ScheD Thu Aug 7 13:54:34 2025 +2025-08-07T13:54:34Z USER 49414 (sg02) [ModuleForkPass]: post_sched finished after 1.661 seconds +2025-08-07T13:54:34Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 719mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11821 memory location(s), 1 block(s), and 60406 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:34Z USER 49414 (sg02) [ModuleForkPass]: Running expand_scheduling_units +2025-08-07T13:54:34Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=11821 blocks=1 instructions=60406 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:34Z USER 49414 (sg02) [ModuleForkPass]: expand_scheduling_units finished after 0.026 seconds +2025-08-07T13:54:34Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 691mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:34Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11821 memory location(s), 1 block(s), and 60406 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:34Z USER 49414 (sg02) [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:54:34Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=11821 blocks=1 instructions=60406 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:34Z INFO 49414 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 4113 PSUM Banks +2025-08-07T13:54:34Z INFO 49414 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 4693 PSUM Banks +2025-08-07T13:54:34Z INFO 49414 (sg02) [DMAOptimizationBase]: PSUM Rotation rotated 1 PSUM Banks +2025-08-07T13:54:34Z INFO 49414 (sg02) [DMAOptimizationBase]: SB Rotation rotated 12 Sb address +2025-08-07T13:54:34Z INFO 49414 (sg02) [DMAOptimizationBase]: SB Rotation rotated 227 Sb address +2025-08-07T13:54:35Z INFO 49414 (sg02) [DMAOptimizationBase]: SB Rotation rotated 43 Sb address +2025-08-07T13:54:35Z INFO 49414 (sg02) [DMAOptimizationBase]: SB Rotation rotated 28 Sb address +2025-08-07T13:54:35Z INFO 49414 (sg02) [DMAOptimizationBase]: SB Rotation rotated 181 Sb address +2025-08-07T13:54:35Z INFO 49414 (sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:35Z INFO 49414 (sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:54:35Z USER 49414 (sg02) [ModuleForkPass]: address_rotation_sb finished after 1.155 seconds +2025-08-07T13:54:35Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 697mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11821 memory location(s), 1 block(s), and 60406 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:35Z USER 49414 (sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:35Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=11821 blocks=1 instructions=60406 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:35Z INFO 49414 (sg02) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:35Z INFO 49414 (sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:54:35Z INFO 49414 (sg02) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:35Z USER 49414 (sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.206 seconds +2025-08-07T13:54:35Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 721mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11821 memory location(s), 1 block(s), and 60406 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:35Z USER 49414 (sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:54:35Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=11821 blocks=1 instructions=60406 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:35Z INFO 49414 (sg02) [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:54:35Z INFO 49414 (sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-08-07T13:54:35Z INFO 49414 (sg02) [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:54:35Z USER 49414 (sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.038 seconds +2025-08-07T13:54:35Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 689mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11821 memory location(s), 1 block(s), and 60406 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:35Z USER 49414 (sg02) [ModuleForkPass]: Running dep_opt +2025-08-07T13:54:35Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=11821 blocks=1 instructions=60406 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:35Z INFO 49414 (sg02) [build_flow_deps]: Start build fdeps. Invocation: 9Thu Aug 7 13:54:35 2025 +2025-08-07T13:54:35Z INFO 49414 (sg02) [build_flow_deps]: Allocs: 11821 instructions: 60406 +2025-08-07T13:54:35Z INFO 49414 (sg02) [build_flow_deps]: Build fdeps inserted 203516 edges +2025-08-07T13:54:35Z INFO 49414 (sg02) [build_flow_deps]: Done build fdeps 203516 Thu Aug 7 13:54:35 2025 +2025-08-07T13:54:35Z USER 49414 (sg02) [ModuleForkPass]: dep_opt finished after 0.225 seconds +2025-08-07T13:54:35Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 694mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11821 memory location(s), 1 block(s), and 60406 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:35Z USER 49414 (sg02) [ModuleForkPass]: Running report_stats +2025-08-07T13:54:35Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=11821 blocks=1 instructions=60406 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:35Z INFO 49414 (sg02) [ReportStats]: Data Movement Statistics: sg0002 +┌──────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 25165824 │ +│ DMACopy │ Internal │ 1 │ 8388608 │ +│ Load │ Const -> Internal │ 4 │ 34824 │ +│ Load │ ExternalInput -> Internal │ 3018 │ 823676940 │ +│ Load │ Internal │ 100 │ 43833096 │ +│ Save │ Internal │ 675 │ 28626692 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 4 │ +│ Save (Spill) │ Internal │ 20 │ 10485760 │ +└──────────────┴────────────────────────────┴───────┴───────────┘ + +2025-08-07T13:54:35Z INFO 49414 (sg02) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 1 │ +│ 4 │ 9 │ +│ 8 │ 2 │ +│ 16 │ 3 │ +│ 64 │ 2 │ +│ 256 │ 1538 │ +│ 512 │ 593 │ +│ 1024 │ 14 │ +│ 2048 │ 34 │ +│ 4096 │ 1618 │ +│ 60768 │ 1 │ +│ 60776 │ 4 │ +│ 8388608 │ 3 │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:35Z INFO 49414 (sg02) [ReportStats]: MM Stats: #MatMults 48723 #MatMult-Transposes 20371 +2025-08-07T13:54:35Z INFO 49414 (sg02) [ReportStats]: IO Tensor size combined: 773345296 +2025-08-07T13:54:35Z INFO 49414 (sg02) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input473 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input469 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input472 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input470 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input474 │ ExternalInput │ bfloat16 │ 8192 │ +│ input471 │ ExternalInput │ bfloat16 │ 8192 │ +│ input1 │ ExternalInput │ int32 │ 4096 │ +│ input3 │ ExternalInput │ float32 │ 12 │ +│ output0 │ ExternalOutput │ int32 │ 4 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-08-07T13:54:35Z INFO 49414 (sg02) [ReportStats]: Large (Internal) Tensor Statistics: +┌──────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├──────────────────────────┼──────────┼──────────┼──────────────┤ +│ all_reduce.3-buffer-2825 │ Internal │ bfloat16 │ 8388608 │ +│ dot.14-buffer-2823 │ Internal │ bfloat16 │ 8388608 │ +│ intermediate109 │ Input │ bfloat16 │ 8388608 │ +│ convert.57 │ Internal │ bfloat16 │ 8388608 │ +│ add.9 │ Internal │ bfloat16 │ 8388608 │ +│ intermediate108 │ Input │ bfloat16 │ 8388608 │ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ -t2849 │ Internal │ float32 │ 1048576 │ +│ -t2843 │ Internal │ float32 │ 1048576 │ +│ -t2838 │ Internal │ float32 │ 1048576 │ +└──────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-08-07T13:54:35Z USER 49414 (sg02) [ModuleForkPass]: report_stats finished after 0.016 seconds +2025-08-07T13:54:35Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 687mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11821 memory location(s), 1 block(s), and 60406 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:35Z USER 49414 [ModuleForkPass]: Compilation status: Total modules: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:35Z USER 49414 [BackendPassManager]: mod_parallel_pass finished after 31.997 seconds +2025-08-07T13:54:35Z INFO 49414 [BackendPassManager]: curr_vmrss: 687mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49414 [BackendPassManager]: Output has 3 module(s), 3 function(s), 20442 memory location(s), 3 block(s), and 89295 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:35Z USER 49414 [BackendPassManager]: Running assign_trigger_engine +2025-08-07T13:54:35Z INFO 49414 [BackendPassManager]: Inputs to assign_trigger_engine: modules=3 functions=3 allocs=20442 blocks=3 instructions=89295 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:35Z INFO 49414 (sg00) [AssignTriggerEngine]: Assigned trigger engine for 163 DMA instructions. Moved 50 DMA instructions to CC's engines. +2025-08-07T13:54:35Z INFO 49414 (sg01) [AssignTriggerEngine]: Assigned trigger engine for 160 DMA instructions. Moved 17 DMA instructions to CC's engines. +2025-08-07T13:54:35Z INFO 49414 (sg02) [AssignTriggerEngine]: Assigned trigger engine for 714 DMA instructions. Moved 19 DMA instructions to CC's engines. +2025-08-07T13:54:35Z USER 49414 [BackendPassManager]: assign_trigger_engine finished after 0.089 seconds +2025-08-07T13:54:35Z INFO 49414 [BackendPassManager]: curr_vmrss: 677mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49414 [BackendPassManager]: Output has 3 module(s), 3 function(s), 20442 memory location(s), 3 block(s), and 89295 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:35Z USER 49414 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:54:35Z INFO 49414 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=3 functions=3 allocs=20442 blocks=3 instructions=89295 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:35Z USER 49414 (sg00) [SubgraphForkPass]: Running lower_local_collectives +2025-08-07T13:54:35Z INFO 49414 (sg00) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=1 functions=1 allocs=3899 blocks=1 instructions=8451 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:35Z USER 49414 (sg01) [SubgraphForkPass]: Running lower_local_collectives +2025-08-07T13:54:35Z USER 49414 (sg00) [SubgraphForkPass]: lower_local_collectives finished after 0.000 seconds +2025-08-07T13:54:35Z USER 49414 (sg02) [SubgraphForkPass]: Running lower_local_collectives +2025-08-07T13:54:35Z INFO 49414 (sg00) [SubgraphForkPass]: curr_vmrss: 677mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49414 (sg00) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 3899 memory location(s), 1 block(s), and 8451 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:35Z USER 49414 (sg00) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-08-07T13:54:35Z INFO 49414 (sg00) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=1 functions=1 allocs=3899 blocks=1 instructions=8451 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:35Z USER 49414 (sg00) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.000 seconds +2025-08-07T13:54:35Z INFO 49414 (sg00) [SubgraphForkPass]: curr_vmrss: 677mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49414 (sg01) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=1 functions=1 allocs=4722 blocks=1 instructions=20438 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:35Z INFO 49414 (sg02) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=1 functions=1 allocs=11821 blocks=1 instructions=60406 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:35Z USER 49414 (sg02) [SubgraphForkPass]: lower_local_collectives finished after 0.000 seconds +2025-08-07T13:54:35Z USER 49414 (sg01) [SubgraphForkPass]: lower_local_collectives finished after 0.000 seconds +2025-08-07T13:54:35Z INFO 49414 (sg00) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 3899 memory location(s), 1 block(s), and 8451 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:35Z INFO 49414 (sg02) [SubgraphForkPass]: curr_vmrss: 677mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49414 (sg01) [SubgraphForkPass]: curr_vmrss: 677mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:35Z USER 49414 (sg00) [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:54:35Z INFO 49414 (sg00) [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=3899 blocks=1 instructions=8451 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:35Z INFO 49414 (sg01) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 4722 memory location(s), 1 block(s), and 20438 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:35Z USER 49414 (sg01) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-08-07T13:54:35Z INFO 49414 (sg01) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=1 functions=1 allocs=4722 blocks=1 instructions=20438 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:35Z USER 49414 (sg01) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.000 seconds +2025-08-07T13:54:35Z INFO 49414 (sg01) [SubgraphForkPass]: curr_vmrss: 677mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49414 (sg02) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 11821 memory location(s), 1 block(s), and 60406 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:35Z USER 49414 (sg02) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-08-07T13:54:35Z INFO 49414 (sg01) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 4722 memory location(s), 1 block(s), and 20438 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:35Z USER 49414 (sg01) [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:54:35Z INFO 49414 (sg01) [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=4722 blocks=1 instructions=20438 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:35Z INFO 49414 (sg02) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=1 functions=1 allocs=11821 blocks=1 instructions=60406 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:35Z USER 49414 (sg02) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.000 seconds +2025-08-07T13:54:35Z INFO 49414 (sg02) [SubgraphForkPass]: curr_vmrss: 677mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49414 (sg02) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 11821 memory location(s), 1 block(s), and 60406 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:35Z USER 49414 (sg02) [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:54:35Z INFO 49414 (sg02) [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=11821 blocks=1 instructions=60406 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:35Z INFO 49414 (sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:35Z USER 49414 (sg00) [SubgraphForkPass]: dead_code_elim finished after 0.026 seconds +2025-08-07T13:54:35Z INFO 49414 (sg00) [SubgraphForkPass]: curr_vmrss: 678mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49414 (sg00) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 3899 memory location(s), 1 block(s), and 8451 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:35Z INFO 49414 (sg02) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:35Z INFO 49414 (sg01) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:54:35Z USER 49414 (sg01) [SubgraphForkPass]: dead_code_elim finished after 0.062 seconds +2025-08-07T13:54:35Z INFO 49414 (sg01) [SubgraphForkPass]: curr_vmrss: 678mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49414 (sg01) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 4722 memory location(s), 1 block(s), and 20438 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:35Z USER 49414 (sg02) [SubgraphForkPass]: dead_code_elim finished after 0.072 seconds +2025-08-07T13:54:35Z INFO 49414 (sg02) [SubgraphForkPass]: curr_vmrss: 677mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49414 (sg02) [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 11821 memory location(s), 1 block(s), and 60406 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:35Z USER 49414 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:35Z USER 49414 [BackendPassManager]: subgraph_parallel_pass finished after 0.077 seconds +2025-08-07T13:54:35Z INFO 49414 [BackendPassManager]: curr_vmrss: 677mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49414 [BackendPassManager]: Output has 3 module(s), 3 function(s), 20442 memory location(s), 3 block(s), and 89295 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:35Z USER 49414 [BackendPassManager]: Running assign_hwdge_engine +2025-08-07T13:54:35Z INFO 49414 [BackendPassManager]: Inputs to assign_hwdge_engine: modules=3 functions=3 allocs=20442 blocks=3 instructions=89295 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:35Z USER 49414 [BackendPassManager]: assign_hwdge_engine finished after 0.016 seconds +2025-08-07T13:54:35Z INFO 49414 [BackendPassManager]: curr_vmrss: 677mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49414 [BackendPassManager]: Output has 3 module(s), 3 function(s), 20442 memory location(s), 3 block(s), and 89295 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:35Z USER 49414 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:54:35Z INFO 49414 [BackendPassManager]: Inputs to mod_parallel_pass: modules=3 functions=3 allocs=20442 blocks=3 instructions=89295 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:35Z USER 49414 (sg00) [ModuleForkPass]: Running alloc_queues +2025-08-07T13:54:35Z USER 49414 (sg02) [ModuleForkPass]: Running alloc_queues +2025-08-07T13:54:35Z USER 49414 (sg01) [ModuleForkPass]: Running alloc_queues +2025-08-07T13:54:35Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=3899 blocks=1 instructions=8451 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:35Z INFO 49414 (sg00) [AllocQueues]: DMACopy transpose will be triggered from multiple engines +2025-08-07T13:54:35Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=4722 blocks=1 instructions=20438 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:35Z INFO 49414 (sg01) [AllocQueues]: DMACopy transpose will be triggered from multiple engines +2025-08-07T13:54:35Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=11821 blocks=1 instructions=60406 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:35Z INFO 49414 (sg02) [AllocQueues]: DMACopy transpose will be triggered from multiple engines +2025-08-07T13:54:35Z INFO 49414 (sg00) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 2 │ +│ qPoolIO0 │ input │ Pool │ 16 │ 1 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 132 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 98 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 49 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 15 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 264 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-08-07T13:54:35Z USER 49414 (sg00) [ModuleForkPass]: alloc_queues finished after 0.002 seconds +2025-08-07T13:54:35Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 677mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3899 memory location(s), 1 block(s), and 8451 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:35Z USER 49414 (sg00) [ModuleForkPass]: Running chain_dma_transposes +2025-08-07T13:54:35Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=3899 blocks=1 instructions=8451 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:35Z USER 49414 (sg00) [ModuleForkPass]: chain_dma_transposes finished after 0.000 seconds +2025-08-07T13:54:35Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 677mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3899 memory location(s), 1 block(s), and 8451 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:35Z USER 49414 (sg00) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-08-07T13:54:35Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=3899 blocks=1 instructions=8451 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:35Z USER 49414 (sg00) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-08-07T13:54:35Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 677mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3899 memory location(s), 1 block(s), and 8451 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:35Z USER 49414 (sg00) [ModuleForkPass]: Running lower_control +2025-08-07T13:54:35Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=3899 blocks=1 instructions=8451 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:35Z INFO 49414 (sg01) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 1 │ +│ qPoolIO0 │ input │ Pool │ 16 │ 1 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 118 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 112 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 31 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 16 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 2059 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-08-07T13:54:35Z USER 49414 (sg01) [ModuleForkPass]: alloc_queues finished after 0.003 seconds +2025-08-07T13:54:35Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 677mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4722 memory location(s), 1 block(s), and 20438 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:35Z USER 49414 (sg01) [ModuleForkPass]: Running chain_dma_transposes +2025-08-07T13:54:35Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=4722 blocks=1 instructions=20438 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:35Z USER 49414 (sg01) [ModuleForkPass]: chain_dma_transposes finished after 0.000 seconds +2025-08-07T13:54:35Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 677mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4722 memory location(s), 1 block(s), and 20438 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:35Z USER 49414 (sg01) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-08-07T13:54:35Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=4722 blocks=1 instructions=20438 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:35Z USER 49414 (sg01) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-08-07T13:54:35Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 677mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4722 memory location(s), 1 block(s), and 20438 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:35Z USER 49414 (sg01) [ModuleForkPass]: Running lower_control +2025-08-07T13:54:35Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=4722 blocks=1 instructions=20438 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:35Z INFO 49414 (sg02) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 5 │ +│ qPoolIO0 │ input │ Pool │ 16 │ 1 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 85 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 680 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 12 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 22 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 3015 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-08-07T13:54:35Z USER 49414 (sg02) [ModuleForkPass]: alloc_queues finished after 0.012 seconds +2025-08-07T13:54:35Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 677mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:35Z INFO 49414 (sg00) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-08-07T13:54:36Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11821 memory location(s), 1 block(s), and 60406 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:36Z USER 49414 (sg02) [ModuleForkPass]: Running chain_dma_transposes +2025-08-07T13:54:36Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=11821 blocks=1 instructions=60406 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:36Z USER 49414 (sg02) [ModuleForkPass]: chain_dma_transposes finished after 0.000 seconds +2025-08-07T13:54:36Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 677mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:36Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11821 memory location(s), 1 block(s), and 60406 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:36Z USER 49414 (sg02) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-08-07T13:54:36Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=11821 blocks=1 instructions=60406 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:36Z USER 49414 (sg02) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-08-07T13:54:36Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 677mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:36Z USER 49414 (sg00) [ModuleForkPass]: lower_control finished after 0.012 seconds +2025-08-07T13:54:36Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 677mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:36Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11821 memory location(s), 1 block(s), and 60406 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:36Z USER 49414 (sg02) [ModuleForkPass]: Running lower_control +2025-08-07T13:54:36Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3899 memory location(s), 1 block(s), and 8451 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:36Z USER 49414 (sg00) [ModuleForkPass]: Running dep_reduction +2025-08-07T13:54:36Z INFO 49414 (sg00) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=3899 blocks=1 instructions=8451 Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:36Z INFO 49414 (sg00) [DepReduction]: Start Dependency Reduction +2025-08-07T13:54:36Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=11821 blocks=1 instructions=60406 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:36Z INFO 49414 (sg00) [DepReduction]: Processing async instrs... +2025-08-07T13:54:36Z INFO 49414 (sg00) [DepReduction]: Processing secondary edges per engine... +2025-08-07T13:54:36Z INFO 49414 (sg00) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 7187 +2025-08-07T13:54:36Z INFO 49414 (sg01) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-08-07T13:54:36Z INFO 49414 (sg00) [DepReduction]: Processing redundant descendants, Done. Num edges removed 7653 +2025-08-07T13:54:36Z INFO 49414 (sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 7653 +2025-08-07T13:54:36Z USER 49414 (sg01) [ModuleForkPass]: lower_control finished after 0.032 seconds +2025-08-07T13:54:36Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 677mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:36Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4722 memory location(s), 1 block(s), and 20438 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:36Z USER 49414 (sg01) [ModuleForkPass]: Running dep_reduction +2025-08-07T13:54:36Z INFO 49414 (sg01) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=4722 blocks=1 instructions=20438 Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:36Z INFO 49414 (sg01) [DepReduction]: Start Dependency Reduction +2025-08-07T13:54:36Z INFO 49414 (sg01) [DepReduction]: Processing async instrs... +2025-08-07T13:54:36Z INFO 49414 (sg01) [DepReduction]: Processing secondary edges per engine... +2025-08-07T13:54:36Z INFO 49414 (sg01) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 25928 +2025-08-07T13:54:36Z INFO 49414 (sg00) [DepReduction]: Num Async removed: 0 +2025-08-07T13:54:36Z INFO 49414 (sg00) [DepReduction]: Finished dependency reduction: 44244 removed, new total 3760 +2025-08-07T13:54:36Z INFO 49414 (sg00) [DepReduction]: Finished Dependency Reduction +2025-08-07T13:54:36Z USER 49414 (sg00) [ModuleForkPass]: dep_reduction finished after 0.074 seconds +2025-08-07T13:54:36Z INFO 49414 (sg00) [ModuleForkPass]: curr_vmrss: 681mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:36Z INFO 49414 (sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3899 memory location(s), 1 block(s), and 8451 instruction(s). Max writers: 32 Max Readers: 1312 +2025-08-07T13:54:36Z INFO 49414 (sg02) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-08-07T13:54:36Z INFO 49414 (sg01) [DepReduction]: Processing redundant descendants, Done. Num edges removed 28202 +2025-08-07T13:54:36Z INFO 49414 (sg01) [DepReduction]: Processing async instrs, Done. Num edges removed 28202 +2025-08-07T13:54:36Z USER 49414 (sg02) [ModuleForkPass]: lower_control finished after 0.106 seconds +2025-08-07T13:54:36Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 681mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:36Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11821 memory location(s), 1 block(s), and 60406 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:36Z USER 49414 (sg02) [ModuleForkPass]: Running dep_reduction +2025-08-07T13:54:36Z INFO 49414 (sg02) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=11821 blocks=1 instructions=60406 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:36Z INFO 49414 (sg02) [DepReduction]: Start Dependency Reduction +2025-08-07T13:54:36Z INFO 49414 (sg02) [DepReduction]: Processing async instrs... +2025-08-07T13:54:36Z INFO 49414 (sg02) [DepReduction]: Processing secondary edges per engine... +2025-08-07T13:54:36Z INFO 49414 (sg02) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 59058 +2025-08-07T13:54:36Z INFO 49414 (sg01) [DepReduction]: Num Async removed: 0 +2025-08-07T13:54:36Z INFO 49414 (sg01) [DepReduction]: Finished dependency reduction: 132848 removed, new total 6419 +2025-08-07T13:54:36Z INFO 49414 (sg01) [DepReduction]: Finished Dependency Reduction +2025-08-07T13:54:36Z USER 49414 (sg01) [ModuleForkPass]: dep_reduction finished after 0.204 seconds +2025-08-07T13:54:36Z INFO 49414 (sg01) [ModuleForkPass]: curr_vmrss: 698mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:36Z INFO 49414 (sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4722 memory location(s), 1 block(s), and 20438 instruction(s). Max writers: 48 Max Readers: 1904 +2025-08-07T13:54:36Z INFO 49414 (sg02) [DepReduction]: Processing redundant descendants, Done. Num edges removed 62932 +2025-08-07T13:54:36Z INFO 49414 (sg02) [DepReduction]: Processing async instrs, Done. Num edges removed 62932 +2025-08-07T13:54:36Z INFO 49414 (sg02) [DepReduction]: Num Async removed: 0 +2025-08-07T13:54:36Z INFO 49414 (sg02) [DepReduction]: Finished dependency reduction: 413818 removed, new total 18893 +2025-08-07T13:54:36Z INFO 49414 (sg02) [DepReduction]: Finished Dependency Reduction +2025-08-07T13:54:36Z USER 49414 (sg02) [ModuleForkPass]: dep_reduction finished after 0.673 seconds +2025-08-07T13:54:36Z INFO 49414 (sg02) [ModuleForkPass]: curr_vmrss: 724mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:36Z INFO 49414 (sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11821 memory location(s), 1 block(s), and 60406 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:36Z USER 49414 [ModuleForkPass]: Compilation status: Total modules: 3, Passed: 3, Failed: 0 +2025-08-07T13:54:36Z USER 49414 [BackendPassManager]: mod_parallel_pass finished after 0.838 seconds +2025-08-07T13:54:36Z INFO 49414 [BackendPassManager]: curr_vmrss: 720mb, ru_maxrss: 739mb (delta=0mb) +2025-08-07T13:54:36Z INFO 49414 [BackendPassManager]: Output has 3 module(s), 3 function(s), 20442 memory location(s), 3 block(s), and 89295 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:36Z USER 49414 [BackendPassManager]: Running nc_parallel_pass +2025-08-07T13:54:36Z INFO 49414 [BackendPassManager]: Inputs to nc_parallel_pass: modules=3 functions=3 allocs=20442 blocks=3 instructions=89295 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:36Z USER 49414 [CoreForkPass]: Running bir_linker +2025-08-07T13:54:36Z INFO 49414 [CoreForkPass]: Inputs to bir_linker: modules=3 functions=3 allocs=20442 blocks=3 instructions=89295 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:36Z INFO 49414 (sgLnk) [BirLinker]: bir_linker cwd: +2025-08-07T13:54:36Z INFO 49414 (sgLnk) [BirLinker]: Num intermediates 111 +2025-08-07T13:54:36Z INFO 49414 (sgLnk) [BirLinker]: Num Module Definitions 3 +2025-08-07T13:54:36Z INFO 49414 (sgLnk) [BirLinker]: Linking to a call-graph structure +2025-08-07T13:54:36Z INFO 49414 (sgLnk) [BirLinker]: Added a new SpillReload Que qPoolPIOParam0 +2025-08-07T13:54:37Z INFO 49414 (sgLnk) [BirLinker]: tensor_map verification successful. +2025-08-07T13:54:37Z INFO 49414 (sgLnk) [BirLinker]: Writing updated tensor_map /home/ubuntu/qwen3/context_encoding_model/_tp0_bk3/neuronxcc-dfnjq5y6/sgLnk/sg00/tensor_map.json +2025-08-07T13:54:37Z INFO 49414 (sgLnk) [BirLinker]: PostLink Stats: #MatMults 555691 #MatMult-Transposes 88323 +2025-08-07T13:54:37Z INFO 49414 (sgLnk) [BirLinker]: Total Intermediate MMTs 9776 #out: 9216 #inp: 560 #symmetric: 0 +2025-08-07T13:54:37Z INFO 49414 (sgLnk) [BirLinker]: Total Intermediate IOs with MMTs: 38 #out: 36 #inp: 2 #both: 0 +2025-08-07T13:54:37Z INFO 49414 (sgLnk) [BirLinker]: releasing pre-link modules +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [BirLinker]: linking Done. +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: bir_linker finished after 1.193 seconds +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: curr_vmrss: 1152mb, ru_maxrss: 1152mb (delta=413mb) +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Output has 1 module(s), 4 function(s), 21101 memory location(s), 4 block(s), and 89359 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: Running postlnk_dma_report +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Inputs to postlnk_dma_report: modules=1 functions=4 allocs=21101 blocks=4 instructions=89359 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [DMAReport]: DMA Report: Bytes loaded or saved 1411019560, 81.1284% input load, 1.30048% output write, 17.5712% spill/reload +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: postlnk_dma_report finished after 0.011 seconds +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: curr_vmrss: 609mb, ru_maxrss: 1152mb (delta=0mb) +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Output has 1 module(s), 4 function(s), 21101 memory location(s), 4 block(s), and 89359 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: Running report_stats +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Inputs to report_stats: modules=1 functions=4 allocs=21101 blocks=4 instructions=89359 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [ReportStats]: Data Movement Statistics: main +┌─────────────┬──────┬───────┬───────┐ +│ Instruction │ Kind │ Count │ Bytes │ +└─────────────┴──────┴───────┴───────┘ + +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [ReportStats]: Data Movement Statistics: sg0000 +┌──────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 17 │ 9957281792 │ +│ DMACopy │ Internal -> ExternalOutput │ 64 │ 67108864 │ +│ DMACopy │ Internal -> Output │ 1 │ 16777216 │ +│ Load │ Const -> Internal │ 3 │ 65792 │ +│ Load │ ExternalInput -> Internal │ 148 │ 58733056 │ +│ Load │ Internal │ 178 │ 45367296 │ +│ Save │ Internal │ 62 │ 16252928 │ +│ Save │ Internal -> Output │ 37 │ 9961474 │ +│ Save (Spill) │ Internal │ 51 │ 11681792 │ +└──────────────┴────────────────────────────┴───────┴────────────┘ + +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 3 │ +│ 64 │ 1 │ +│ 256 │ 3 │ +│ 512 │ 1 │ +│ 896 │ 6 │ +│ 1024 │ 46 │ +│ 1920 │ 32 │ +│ 2048 │ 288 │ +│ 4096 │ 116 │ +│ 262144 │ 64 │ +│ 8388608 │ 2 │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [ReportStats]: Data Movement Statistics: sg0001 +┌──────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 25165824 │ +│ DMACopy │ Internal -> ExternalOutput │ 64 │ 67108864 │ +│ DMACopy │ Internal -> Output │ 1 │ 16777216 │ +│ Load │ Const -> Internal │ 2 │ 65536 │ +│ Load │ ExternalInput -> Internal │ 1972 │ 260063744 │ +│ Load │ Input -> Internal │ 6 │ 2097152 │ +│ Load │ Internal │ 132 │ 47448064 │ +│ Save │ Internal │ 99 │ 31195136 │ +│ Save │ Internal -> Output │ 17 │ 8388610 │ +│ Save (Spill) │ Internal │ 44 │ 13041664 │ +└──────────────┴────────────────────────────┴───────┴───────────┘ + +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 3 │ +│ 64 │ 2 │ +│ 256 │ 1538 │ +│ 1024 │ 73 │ +│ 2048 │ 156 │ +│ 4096 │ 500 │ +│ 262144 │ 64 │ +│ 8388608 │ 5 │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [ReportStats]: Data Movement Statistics: sg0002 +┌──────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 25165824 │ +│ DMACopy │ Internal │ 1 │ 8388608 │ +│ Load │ Const -> Internal │ 4 │ 34824 │ +│ Load │ ExternalInput -> Internal │ 3018 │ 823676940 │ +│ Load │ Internal │ 100 │ 43833096 │ +│ Save │ Internal │ 675 │ 28626692 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 4 │ +│ Save (Spill) │ Internal │ 20 │ 10485760 │ +└──────────────┴────────────────────────────┴───────┴───────────┘ + +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 1 │ +│ 4 │ 9 │ +│ 8 │ 2 │ +│ 16 │ 3 │ +│ 64 │ 2 │ +│ 256 │ 1538 │ +│ 512 │ 593 │ +│ 1024 │ 14 │ +│ 2048 │ 34 │ +│ 4096 │ 1618 │ +│ 60768 │ 1 │ +│ 60776 │ 4 │ +│ 8388608 │ 3 │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [ReportStats]: MM Stats: #MatMults 67451 #MatMult-Transposes 23587 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [ReportStats]: IO Tensor size combined: 9981025324 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬───────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼───────────────┼──────────┼──────────────┤ +│ input76_sg0000 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input473_sg0002 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input76 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input473 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input131 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input109 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input98 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input153 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input87 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input175 │ ExternalInput │ bfloat16 │ 50331648 │ +└────────────────────┴───────────────┴──────────┴──────────────┘ + +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [ReportStats]: Large (Internal) Tensor Statistics: +┌─────────────────┬───────────────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├─────────────────┼───────────────────┼──────────┼──────────────┤ +│ intermediate1 │ InternalInterface │ bfloat16 │ 8388608 │ +│ intermediate4 │ InternalInterface │ bfloat16 │ 8388608 │ +│ intermediate18 │ InternalInterface │ bfloat16 │ 8388608 │ +│ intermediate9 │ InternalInterface │ bfloat16 │ 8388608 │ +│ intermediate15 │ InternalInterface │ bfloat16 │ 8388608 │ +│ intermediate12 │ InternalInterface │ bfloat16 │ 8388608 │ +│ intermediate27 │ InternalInterface │ bfloat16 │ 8388608 │ +│ intermediate24 │ InternalInterface │ bfloat16 │ 8388608 │ +│ intermediate21 │ InternalInterface │ bfloat16 │ 8388608 │ +│ intermediate6 │ InternalInterface │ bfloat16 │ 8388608 │ +└─────────────────┴───────────────────┴──────────┴──────────────┘ + +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: report_stats finished after 0.023 seconds +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: curr_vmrss: 609mb, ru_maxrss: 1152mb (delta=0mb) +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Output has 1 module(s), 4 function(s), 21101 memory location(s), 4 block(s), and 89359 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: Running coloring_allocator_dram_post_lnk +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Inputs to coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=21101 blocks=4 instructions=89359 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [DRAM_Allocator]: allocating spills in DRAM post_link mode for address space Local +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [DRAM_Allocator]: reserved space = 8342046740 bytes +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [DRAM_Allocator]: spill space = 605552712 bytes +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [DRAM_Allocator]: aligned spill space = 605700096 bytes +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [DRAM_Allocator]: renumber locations +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [DRAM_Allocator]: size = 111 +2025-08-07T13:54:38Z INFO 49414 []: find first defs for local +2025-08-07T13:54:38Z INFO 49414 []: find first defs for global +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [DRAM_Allocator]: Num intervals 111 Num locations 111 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [DRAM_Allocator]: IntervalTree Build Done +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [DRAM_Allocator]: info.neighbors init Done +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [DRAM_Allocator]: IntervalTree readback Done +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [DRAM_Allocator]: simplify interference graph +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [DRAM_Allocator]: initialize low and high +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [DRAM_Allocator]: lo = 111 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [DRAM_Allocator]: hi = 0 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [DRAM_Allocator]: total = 111 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [DRAM_Allocator]: simplify +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [DRAM_Allocator]: new candidates = 0 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [DRAM_Allocator]: Already used DRAM hwm: 55443456 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [DRAM_Allocator]: select ranges +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [DRAM_Allocator]: allreduce_dram_hwm 55443456 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [DRAM_Allocator]: Real CC buffer size 55443456 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [DRAM_Allocator]: DRAM hwm after allocation: 98971648 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [DRAM_Allocator]: DRAM allocation successful +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: coloring_allocator_dram_post_lnk finished after 0.064 seconds +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: curr_vmrss: 609mb, ru_maxrss: 1152mb (delta=0mb) +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Output has 1 module(s), 4 function(s), 21101 memory location(s), 4 block(s), and 89359 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_post_lnk +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=21101 blocks=4 instructions=89359 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_post_lnk finished after 0.038 seconds +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: curr_vmrss: 610mb, ru_maxrss: 1152mb (delta=0mb) +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Output has 1 module(s), 4 function(s), 21101 memory location(s), 4 block(s), and 89359 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: Running lower_dynamic_dma +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=4 allocs=21101 blocks=4 instructions=89359 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: lower_dynamic_dma finished after 0.012 seconds +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: curr_vmrss: 610mb, ru_maxrss: 1152mb (delta=0mb) +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Output has 1 module(s), 4 function(s), 21101 memory location(s), 4 block(s), and 89359 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: Running legalize_dynamic_dma +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=4 allocs=21101 blocks=4 instructions=89359 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 1 DGE instructions +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 1 DGE instructions were scanned +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: legalize_dynamic_dma finished after 0.030 seconds +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: curr_vmrss: 610mb, ru_maxrss: 1152mb (delta=0mb) +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Output has 1 module(s), 4 function(s), 21101 memory location(s), 4 block(s), and 89359 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: Running lower_dma +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Inputs to lower_dma: modules=1 functions=4 allocs=21101 blocks=4 instructions=89359 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 72840/72840 (100% DGE) + power-of-2 partition : 72878/72919 (99.9438% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 72878/72919 (99.9438% DGE) + Cast (DGE/DMA) + 128 partition : 145/145 (100% DGE) + power-of-2 partition : 145/146 (99.3151% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 145/146 (99.3151% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 0/9782 (0% DGE) + power-of-2 partition : 0/10788 (0% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/10788 (0% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/0 + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/0 + CopyMode + CCE : 36 + Transpose : 1 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 1/1 (100% DGE) + vector : 2320/2320 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: lower_dma finished after 0.127 seconds +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: curr_vmrss: 610mb, ru_maxrss: 1152mb (delta=0mb) +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Output has 1 module(s), 4 function(s), 21101 memory location(s), 4 block(s), and 89359 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: Running expand_all_engine +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=4 allocs=21101 blocks=4 instructions=89359 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: expand_all_engine finished after 0.013 seconds +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: curr_vmrss: 610mb, ru_maxrss: 1152mb (delta=0mb) +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Output has 1 module(s), 4 function(s), 21101 memory location(s), 4 block(s), and 89359 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: Running alloc_semaphores +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=4 allocs=21101 blocks=4 instructions=89359 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: alloc_semaphores finished after 0.074 seconds +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: curr_vmrss: 610mb, ru_maxrss: 1152mb (delta=0mb) +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Output has 1 module(s), 4 function(s), 21101 memory location(s), 4 block(s), and 89359 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: Running expand_inst_late +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=4 allocs=21101 blocks=4 instructions=89359 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: expand_inst_late finished after 0.069 seconds +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: curr_vmrss: 610mb, ru_maxrss: 1152mb (delta=0mb) +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Output has 1 module(s), 4 function(s), 21101 memory location(s), 4 block(s), and 89634 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: Running seq_inst_opt +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=4 allocs=21101 blocks=4 instructions=89634 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [SeqInstOpt]: Removing 205 unnecessary InstRegisterMove instruction(s) from Block1 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [SeqInstOpt]: Removing 63 unnecessary InstRegisterMove instruction(s) from Block1 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: seq_inst_opt finished after 0.010 seconds +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: curr_vmrss: 610mb, ru_maxrss: 1152mb (delta=0mb) +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Output has 1 module(s), 4 function(s), 21101 memory location(s), 4 block(s), and 89366 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: Running lower_sync +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Inputs to lower_sync: modules=1 functions=4 allocs=21101 blocks=4 instructions=89366 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: lower_sync finished after 0.043 seconds +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: curr_vmrss: 617mb, ru_maxrss: 1152mb (delta=0mb) +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Output has 1 module(s), 4 function(s), 21101 memory location(s), 4 block(s), and 95237 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: Running lower_act +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Inputs to lower_act: modules=1 functions=4 allocs=21101 blocks=4 instructions=95237 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: lower_act finished after 0.011 seconds +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: curr_vmrss: 617mb, ru_maxrss: 1152mb (delta=0mb) +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Output has 1 module(s), 4 function(s), 21101 memory location(s), 4 block(s), and 95266 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: Running lower_dve +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Inputs to lower_dve: modules=1 functions=4 allocs=21101 blocks=4 instructions=95266 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen2/dve_info.json +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: lower_dve finished after 0.108 seconds +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: curr_vmrss: 626mb, ru_maxrss: 1152mb (delta=0mb) +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Output has 1 module(s), 4 function(s), 21101 memory location(s), 4 block(s), and 95266 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: Running lower_ap +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Inputs to lower_ap: modules=1 functions=4 allocs=21101 blocks=4 instructions=95266 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: lower_ap finished after 0.016 seconds +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: curr_vmrss: 626mb, ru_maxrss: 1152mb (delta=0mb) +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Output has 1 module(s), 4 function(s), 21101 memory location(s), 4 block(s), and 95266 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: Running coloring_allocator_reg +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=4 allocs=21101 blocks=4 instructions=95266 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: allocating REG +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: main loop iteration 1 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: allocating REG +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: main loop iteration 1 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: renumber registers +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: size = 3 +2025-08-07T13:54:38Z INFO 49414 []: find first defs for local reg +2025-08-07T13:54:38Z INFO 49414 []: find first defs for global reg +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: live range analysis +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: find costs +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: simplify interference graph +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: initialize low and high +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: lo = 3 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: hi = 0 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: inf = 0 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: total = 3 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: simplify +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: new candidates = 0 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: select ranges +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: no more spills +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: allocating REG +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: main loop iteration 1 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: renumber registers +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: size = 1 +2025-08-07T13:54:38Z INFO 49414 []: find first defs for local reg +2025-08-07T13:54:38Z INFO 49414 []: find first defs for global reg +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: live range analysis +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: find costs +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: simplify interference graph +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: initialize low and high +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: lo = 1 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: hi = 0 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: inf = 0 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: total = 1 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: simplify +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: new candidates = 0 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: select ranges +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: no more spills +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: allocating REG +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: main loop iteration 1 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: renumber registers +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: size = 4 +2025-08-07T13:54:38Z INFO 49414 []: find first defs for local reg +2025-08-07T13:54:38Z INFO 49414 []: find first defs for global reg +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: live range analysis +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: find costs +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: simplify interference graph +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: initialize low and high +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: lo = 4 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: hi = 0 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: inf = 0 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: total = 4 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: simplify +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: new candidates = 0 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: select ranges +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: no more spills +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-08-07T13:54:38Z USER 49414 [CoreForkPass]: coloring_allocator_reg finished after 0.108 seconds +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: curr_vmrss: 629mb, ru_maxrss: 1152mb (delta=0mb) +2025-08-07T13:54:38Z INFO 49414 [CoreForkPass]: Output has 1 module(s), 4 function(s), 21101 memory location(s), 4 block(s), and 95266 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [BackendPassManager]: nc_parallel_pass finished after 2.039 seconds +2025-08-07T13:54:38Z INFO 49414 [BackendPassManager]: curr_vmrss: 629mb, ru_maxrss: 1152mb (delta=413mb) +2025-08-07T13:54:38Z INFO 49414 [BackendPassManager]: Output has 1 module(s), 4 function(s), 21101 memory location(s), 4 block(s), and 95266 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:54:38Z INFO 49414 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=4 allocs=21101 blocks=4 instructions=95266 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [ModuleForkPass]: Running birverifier +2025-08-07T13:54:38Z INFO 49414 [ModuleForkPass]: Inputs to birverifier: modules=1 functions=4 allocs=21101 blocks=4 instructions=95266 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [ModuleForkPass]: birverifier finished after 0.091 seconds +2025-08-07T13:54:38Z INFO 49414 [ModuleForkPass]: curr_vmrss: 638mb, ru_maxrss: 1152mb (delta=0mb) +2025-08-07T13:54:38Z INFO 49414 [ModuleForkPass]: Output has 1 module(s), 4 function(s), 21101 memory location(s), 4 block(s), and 95266 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [BackendPassManager]: mod_parallel_pass finished after 0.094 seconds +2025-08-07T13:54:38Z INFO 49414 [BackendPassManager]: curr_vmrss: 638mb, ru_maxrss: 1152mb (delta=0mb) +2025-08-07T13:54:38Z INFO 49414 [BackendPassManager]: Output has 1 module(s), 4 function(s), 21101 memory location(s), 4 block(s), and 95266 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:54:38Z INFO 49414 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=4 allocs=21101 blocks=4 instructions=95266 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:54:38Z INFO 49414 [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=4 allocs=21101 blocks=4 instructions=95266 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-08-07T13:54:38Z INFO 49414 [SubgraphForkPass]: curr_vmrss: 638mb, ru_maxrss: 1152mb (delta=0mb) +2025-08-07T13:54:38Z INFO 49414 [SubgraphForkPass]: Output has 1 module(s), 4 function(s), 21101 memory location(s), 4 block(s), and 95266 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [BackendPassManager]: subgraph_parallel_pass finished after 0.002 seconds +2025-08-07T13:54:38Z INFO 49414 [BackendPassManager]: curr_vmrss: 638mb, ru_maxrss: 1152mb (delta=0mb) +2025-08-07T13:54:38Z INFO 49414 [BackendPassManager]: Output has 1 module(s), 4 function(s), 21101 memory location(s), 4 block(s), and 95266 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:54:38Z INFO 49414 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=4 allocs=21101 blocks=4 instructions=95266 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z USER 49414 [ModuleForkPass]: Running codegen +2025-08-07T13:54:38Z INFO 49414 [ModuleForkPass]: Inputs to codegen: modules=1 functions=4 allocs=21101 blocks=4 instructions=95266 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [Codegen]: Total compiler allocated DRAM tensors: 0.0921745 GB +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [Codegen]: Total un-allocated DRAM tensors by kind: +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [Codegen]: +┌────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼─────────────┤ +│ ExternalInput │ 7.62851 │ +│ ExternalOutput │ 0.0703125 │ +│ Const │ 0.000154741 │ +└────────────────┴─────────────┘ + +2025-08-07T13:54:38Z INFO 49414 (sgLnk) [Codegen]: Total runtime managed DRAM tensors: 7.69898 GB +2025-08-07T13:54:39Z INFO 49414 (sgLnk) [Codegen]: Instruction Stats: +2025-08-07T13:54:39Z INFO 49414 (sgLnk) [Codegen]: +┌─────────────────────┬───────┐ +│ Opcode │ Count │ +├─────────────────────┼───────┤ +│ MATMUL │ 67459 │ +│ LDWEIGHTS │ 67079 │ +│ ACTIVATE │ 10464 │ +│ EVENT_SEMAPHORE │ 5871 │ +│ UNKNOWN(0xd4) │ 5338 │ +│ PSEUDO_DMA_TRIGGER │ 1381 │ +│ TENSOR_TENSOR │ 1021 │ +│ UNKNOWN(0x24) │ 448 │ +│ UNKNOWN(0x8d) │ 448 │ +│ MATCH_VALUE_LOAD │ 441 │ +│ TENSOR_SCALAR_ADDR │ 341 │ +│ UNKNOWN(0xe8) │ 260 │ +│ UNKNOWN(0x8b) │ 240 │ +│ COPY │ 238 │ +│ FIND_INDEX8 │ 224 │ +│ MAX8 │ 224 │ +│ MATCH_REPLACE8 │ 217 │ +│ TENSOR_SCALAR │ 195 │ +│ UNKNOWN(0xd3) │ 185 │ +│ MEMSET │ 175 │ +│ UNKNOWN(0xda) │ 153 │ +│ TENSOR_REDUCE │ 140 │ +│ UNKNOWN(0x8a) │ 128 │ +│ UNKNOWN(0x92) │ 128 │ +│ GATHER │ 99 │ +│ POOL_BUFFER_LOAD │ 99 │ +│ CAST │ 97 │ +│ RECIPROCAL │ 67 │ +│ ACT_TABLE_LOAD │ 29 │ +│ PSEUDO_BRANCH_LABEL │ 20 │ +│ IOTA │ 19 │ +│ UNKNOWN(0xd2) │ 15 │ +│ PSEUDO_DMA_REARM │ 12 │ +│ UNKNOWN(0xcf) │ 12 │ +│ UNKNOWN(0xd9) │ 8 │ +│ MOVE │ 4 │ +│ STREAM_SHUFFLE │ 4 │ +│ LOAD_MASK_SELECT │ 4 │ +│ ALU_OP │ 2 │ +│ UNKNOWN(0xe5) │ 2 │ +│ PSEUDO_TENSOR_LOAD │ 1 │ +│ TENSOR_SCALAR │ 1 │ +│ RNG │ 1 │ +└─────────────────────┴───────┘ + +2025-08-07T13:54:39Z INFO 49414 (sgLnk) [Codegen]: +┌────────────┬────────┐ +│ Engine │ Count │ +├────────────┼────────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 8796 │ +│ Scalar │ 12988 │ +│ Tensor │ 136451 │ +│ SyncDMA │ 0 │ +│ Vector │ 4470 │ +│ Sync │ 609 │ +│ All │ 0 │ +└────────────┴────────┘ + +2025-08-07T13:54:39Z INFO 49414 (sgLnk) [Codegen]: Total instructions: 163314 (0.00973427 GB) +2025-08-07T13:54:39Z INFO 49414 (sgLnk) [Codegen]: Total DynamicDMA instruction count: 5338 +2025-08-07T13:54:39Z USER 49414 (sgLnk) [Codegen]: isa_gen finished after 0.357 seconds +2025-08-07T13:54:39Z INFO 49414 (sgLnk) [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────────────┼────────────────┤ +│ qActSpillReload0_defId_0 │ 25088 │ +│ qActSpillReload0_defId_1 │ 28672 │ +│ qActSpillReload0_defId_2 │ 22188 │ +│ qDVESpillReload0_defId_0 │ 3840 │ +│ qDVESpillReload0_defId_1 │ 6528 │ +│ qDVESpillReload0_defId_2 │ 2056 │ +│ qPoolIO0 │ 2 │ +│ qPoolPIOParam0 │ 72 │ +│ qPoolSpillReload0_defId_0 │ 16896 │ +│ qPoolSpillReload0_defId_1 │ 4096 │ +│ qPoolSpillReload0_defId_2 │ 4870 │ +│ qSPIO0 │ 147610 │ +│ qSPSpillReload0_defId_0 │ 33538 │ +│ qSPSpillReload0_defId_1 │ 30208 │ +│ qSPSpillReload0_defId_2 │ 17950 │ +└───────────────────────────┴────────────────┘ + +Total descriptors: 343614 (0.00512025 GB) +2025-08-07T13:54:39Z INFO 49414 (sgLnk) [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qSPIO0 │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qPoolDynamic │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +│ qPoolIO0 │ 16 │ +│ qPoolPIOParam0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 128 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-08-07T13:54:39Z INFO 49414 (sgLnk) [Codegen]: Tensors with largest descriptor count: +┌────────────────────────────┬──────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├────────────────────────────┼──────────┼──────────┼──────────────────┤ +│ add.9_sg0002 │ Internal │ bfloat16 │ 17 │ +│ all_gather.1_i0_sg0000 │ Internal │ bfloat16 │ 24 │ +│ all_gather.1_i1_sg0000 │ Internal │ bfloat16 │ 25 │ +│ dot.11-buffer-1831_sg0001 │ Internal │ bfloat16 │ 32 │ +│ dot.4-buffer-2238_sg0000 │ Internal │ bfloat16 │ 32 │ +│ dot.14-buffer-2823_sg0002 │ Internal │ bfloat16 │ 32 │ +│ dot.7-buffer-1826_sg0001 │ Internal │ bfloat16 │ 32 │ +│ all-reduce.519.1841_sg0001 │ Internal │ bfloat16 │ 35 │ +│ add.4_sg0001 │ Internal │ bfloat16 │ 51 │ +│ convert.59_sg0002 │ Internal │ float32 │ 599 │ +└────────────────────────────┴──────────┴──────────┴──────────────────┘ + +2025-08-07T13:54:39Z USER 49414 (sgLnk) [Codegen]: dma_desc_gen finished after 0.026 seconds +2025-08-07T13:54:39Z INFO 49414 (sgLnk) [Codegen]: Estimated peak DRAM usage: 7.80601 GB +2025-08-07T13:54:39Z INFO 49414 (sgLnk) [Codegen]: Generating debug info +2025-08-07T13:54:39Z WARNING 49414 (sgLnk) [Codegen]: Found 127 instructions with more than 100 dependencies. For each such instruction, skipping writing more than 100 dependencies into the built-in NEFF debug info to prevent excessive compile time and NEFF size. For those instructions, the Neuron profiler will not display the skipped dependencies. +2025-08-07T13:54:39Z USER 49414 (sgLnk) [Codegen]: debug_info_gen finished after 0.205 seconds +2025-08-07T13:54:39Z USER 49414 [ModuleForkPass]: codegen finished after 0.618 seconds +2025-08-07T13:54:39Z INFO 49414 [ModuleForkPass]: curr_vmrss: 721mb, ru_maxrss: 1152mb (delta=0mb) +2025-08-07T13:54:39Z INFO 49414 [ModuleForkPass]: Output has 1 module(s), 4 function(s), 21101 memory location(s), 4 block(s), and 95266 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:39Z USER 49414 [BackendPassManager]: mod_parallel_pass finished after 0.622 seconds +2025-08-07T13:54:39Z INFO 49414 [BackendPassManager]: curr_vmrss: 721mb, ru_maxrss: 1152mb (delta=0mb) +2025-08-07T13:54:39Z INFO 49414 [BackendPassManager]: Output has 1 module(s), 4 function(s), 21101 memory location(s), 4 block(s), and 95266 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:39Z USER 49414 [BackendPassManager]: Running neff_packager +2025-08-07T13:54:39Z INFO 49414 [BackendPassManager]: Inputs to neff_packager: modules=1 functions=4 allocs=21101 blocks=4 instructions=95266 Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:39Z INFO 49414 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.9-1360_CRSM.npy +2025-08-07T13:54:39Z INFO 49414 [NeffPackager]: FileDeDuper file not found value_sg0000_identity_1547_CRSM.npy +2025-08-07T13:54:39Z INFO 49414 [NeffPackager]: FileDeDuper file not found value_sg0000_t2261_CRSM.npy +2025-08-07T13:54:39Z INFO 49414 [NeffPackager]: FileDeDuper file not found value_sg0001_identity_1184_CRSM.npy +2025-08-07T13:54:39Z INFO 49414 [NeffPackager]: FileDeDuper file not found value_sg0001_t1844_CRSM.npy +2025-08-07T13:54:39Z INFO 49414 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.24_CRSM.npy +2025-08-07T13:54:39Z INFO 49414 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.25_CRSM.npy +2025-08-07T13:54:39Z INFO 49414 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.26-822-934_CRSM.npy +2025-08-07T13:54:39Z INFO 49414 [NeffPackager]: FileDeDuper file not found value_sg0002_identity_1077_CRSM.npy +2025-08-07T13:54:39Z INFO 49414 [NeffPackager]: Const File de-dup saved 0 KB of memory footprint +2025-08-07T13:54:39Z WARNING 49414 [NeffFileWriter]: writeKelp missing file /local/p4clients/pkgbuild-const/workspace/build/KaenaCompiler/KaenaCompiler-2.x.169490.0/AL2_x86_64/DEV.STD.PTHREAD/build/private/_skbuild/linux-x86_64-3.10/cmake-build/neuronxcc/walrus/neff_packager/MetricMetadata.json +2025-08-07T13:54:39Z INFO 49414 [NeffFileWriter]: Neff will be written to: /home/ubuntu/qwen3/context_encoding_model/_tp0_bk3/model.MODULE_b3ddbc97e5f0d1d64c82+155de413.neff +2025-08-07T13:54:39Z INFO 49414 [NeffFileWriter]: IR signature: d0aab7e369a46fb7143fb478eb9b019f for neff artifacts +2025-08-07T13:54:39Z USER 49414 [BackendPassManager]: neff_packager finished after 0.135 seconds +2025-08-07T13:54:39Z INFO 49414 [BackendPassManager]: curr_vmrss: 721mb, ru_maxrss: 1152mb (delta=0mb) +2025-08-07T13:54:39Z INFO 49414 [BackendPassManager]: Output has 1 module(s), 4 function(s), 21101 memory location(s), 4 block(s), and 95266 instruction(s). Max writers: 594 Max Readers: 20371 +2025-08-07T13:54:39Z INFO 49414 [BackendDriver]: HBM scratchpad usage summary (post-allocation): +┌──────┬───────────┬────────────────────────────────────────────────────────────┬─────────────┐ +│ Core │ Subgraph │ Description │ Value │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc00 │ sg00 │ Peak scratchpad usage: local │ 0.036621 GB │ +│ nc00 │ sg00 │ Total size of allocated tensors: local │ 0.036621 GB │ +│ nc00 │ sg01 │ Peak scratchpad usage: local │ 0.051636 GB │ +│ nc00 │ sg01 │ Total size of allocated tensors: local │ 0.062500 GB │ +│ nc00 │ sg02 │ Peak scratchpad usage: local │ 0.036148 GB │ +│ nc00 │ sg02 │ Total size of allocated tensors: local │ 0.050140 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local │ 0.051636 GB │ +│ nc00 │ Post-link │ Peak scratchpad usage after intermediate tensor allocation │ 0.092175 GB │ +│ nc00 │ Post-link │ Total size of allocated intermediate tensors │ 0.564102 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ Max │ Max │ Peak scratchpad usage │ 0.092175 GB │ +│ Max │ Max │ Peak scratchpad usage (page-aligned) │ 0.500000 GB │ +└──────┴───────────┴────────────────────────────────────────────────────────────┴─────────────┘ + +2025-08-07T13:54:39Z INFO 49414 [BackendDriver]: Backend completed successfully, tearing down. +2025-08-07T13:54:39Z INFO 48502 [job.WalrusDriver.0]: new_lnkState: {"model": ["/home/ubuntu/qwen3/context_encoding_model/_tp0_bk3/model.MODULE_b3ddbc97e5f0d1d64c82+155de413.hlo_module.pb"], "tensormap": "tensor_map.json", "bir": "walrus_bir.out.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "cached_wavegraph": "walrus_bir.out.json", "state_dir": "/home/ubuntu/qwen3/context_encoding_model/_tp0_bk3/neuronxcc-dfnjq5y6/sgLnk/sg00", "state_id": "sgLnk"} +2025-08-07T13:54:39Z INFO 48502 [job.WalrusDriver.0]: MTBackend: completed successfully. +2025-08-07T13:54:39Z INFO 48502 [pipeline.Pipeline.0]: Finished job job.WalrusDriver.0 +2025-08-07T13:54:39Z INFO 48502 [pipeline.Pipeline.0]: Starting job job.BIRLinker.0 +2025-08-07T13:54:39Z INFO 48502 [job.BIRLinker.0]: Replay this job by calling: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/bin/neuronx-cc compile --framework XLA --state '{"model": ["/home/ubuntu/qwen3/context_encoding_model/_tp0_bk3/model.MODULE_b3ddbc97e5f0d1d64c82+155de413.hlo_module.pb"], "tensormap": "tensor_map.json", "bir": "walrus_bir.out.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "cached_wavegraph": "walrus_bir.out.json", "state_dir": "/home/ubuntu/qwen3/context_encoding_model/_tp0_bk3/neuronxcc-dfnjq5y6/sgLnk/sg00", "state_id": "sgLnk"}' --pipeline BIRLinker +2025-08-07T13:54:39Z INFO 48502 [job.BIRLinker.0]: BIRLinker cwd: /home/ubuntu/qwen3/context_encoding_model/_tp0_bk3/neuronxcc-dfnjq5y6 +2025-08-07T13:54:39Z INFO 48502 [job.BIRLinker.0]: Linking already done. +2025-08-07T13:54:39Z INFO 48502 [pipeline.Pipeline.0]: Finished job job.BIRLinker.0 +2025-08-07T13:54:39Z INFO 48502 [pipeline.Pipeline.0]: Starting job job.Kelper.0 +2025-08-07T13:54:39Z INFO 48502 [job.Kelper.0]: Skipping neff generation which was already performed by neff_packager +2025-08-07T13:54:39Z INFO 48502 [pipeline.Pipeline.0]: Finished job job.Kelper.0 +2025-08-07T13:54:39Z INFO 48502 [pipeline.Pipeline.0]: Starting job job.NeffWrapper.0 +2025-08-07T13:54:39Z INFO 48502 [job.NeffWrapper.0]: Job NeffWrapper len(in_states) 1 +2025-08-07T13:54:39Z INFO 48502 [job.NeffWrapper.0]: Processing input #0 +2025-08-07T13:54:39Z INFO 48502 [job.NeffWrapper.0]: Start NeffWrapper +2025-08-07T13:54:39Z INFO 48502 [job.NeffWrapper.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo-neff-wrapper --hlo /home/ubuntu/qwen3/context_encoding_model/_tp0_bk3/model.MODULE_b3ddbc97e5f0d1d64c82+155de413.hlo_module.pb --neff /home/ubuntu/qwen3/context_encoding_model/_tp0_bk3/model.MODULE_b3ddbc97e5f0d1d64c82+155de413.neff --io_transposes /home/ubuntu/qwen3/context_encoding_model/_tp0_bk3/neuronxcc-dfnjq5y6/io_transposes.json --output /home/ubuntu/qwen3/context_encoding_model/_tp0_bk3/wrapped_neff.hlo --netlist /home/ubuntu/qwen3/context_encoding_model/_tp0_bk3/neuronxcc-dfnjq5y6/hlo_netlist.json +2025-08-07T13:54:40Z INFO 48502 [job.NeffWrapper.0]: There are no io transposes nor zero-sized parameters. Output will not be produced. +Hlo neff wrapper finished successfully. Have a wonderful day :D + +2025-08-07T13:54:40Z INFO 48502 [job.NeffWrapper.0]: Job #0 finished +2025-08-07T13:54:40Z INFO 48502 [pipeline.Pipeline.0]: Finished job job.NeffWrapper.0 +2025-08-07T13:54:40Z INFO 48502 [pipeline.Pipeline.0]: Finished pipeline Pipeline +2025-08-07T13:54:40Z INFO 48502 [pipeline.Pipeline.0]: Job #0 finished +2025-08-07T13:54:40Z INFO 47918 [root]: Subcommand returned with exitcode=0 diff --git a/context_encoding_model/_tp0_bk3/metaneff.pb b/context_encoding_model/_tp0_bk3/metaneff.pb new file mode 100644 index 0000000000000000000000000000000000000000..c0db298b005c8492612857ee031a5f69cd363890 --- /dev/null +++ b/context_encoding_model/_tp0_bk3/metaneff.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ea141404110996ab61ca5ba70e86499e6c4390e0b31c1ef947cf95911917766 +size 1816103 diff --git a/context_encoding_model/_tp0_bk3/model.MODULE_b3ddbc97e5f0d1d64c82+155de413.hlo_module.pb b/context_encoding_model/_tp0_bk3/model.MODULE_b3ddbc97e5f0d1d64c82+155de413.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..ca2f3236c671c248a7d42b0fd32c6952e1cb0ac4 --- /dev/null +++ b/context_encoding_model/_tp0_bk3/model.MODULE_b3ddbc97e5f0d1d64c82+155de413.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9068f3ba4f55e1b8b35adde74efc6a9e617baa344783aaee62353f9181c3092c +size 1893189 diff --git a/context_encoding_model/_tp0_bk3/model.MODULE_b3ddbc97e5f0d1d64c82+155de413.neff b/context_encoding_model/_tp0_bk3/model.MODULE_b3ddbc97e5f0d1d64c82+155de413.neff new file mode 100644 index 0000000000000000000000000000000000000000..f28021a86f6462720c45f0d0c5e263d0bea73428 --- /dev/null +++ b/context_encoding_model/_tp0_bk3/model.MODULE_b3ddbc97e5f0d1d64c82+155de413.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3602ab29177b01531c0dbdb62bc869556ef53a934ba98dd3bd846e75e171cc3a +size 2561024 diff --git a/context_encoding_model/_tp0_bk3/neuron_config.json b/context_encoding_model/_tp0_bk3/neuron_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e1f2902bd141d27b32a97f706b2df97a34831c04 --- /dev/null +++ b/context_encoding_model/_tp0_bk3/neuron_config.json @@ -0,0 +1,220 @@ +{ + "_attn_implementation_autoset": false, + "_name_or_path": "Qwen/Qwen3-8B", + "add_cross_attention": false, + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "attribute_map": {}, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 151643, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 151645, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "fused_spec_config": null, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 12288, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 40960, + "max_window_layers": 36, + "metadata": null, + "min_length": 0, + "model_type": "qwen3", + "neuron_config": { + "activation_quantization_type": null, + "allow_input_truncation": false, + "apply_seq_ids_mask": false, + "async_mode": false, + "attention_dp_degree": 1, + "attention_dtype": null, + "attn_block_cte_nki_kernel_enabled": false, + "attn_block_tkg_nki_kernel_cache_update": false, + "attn_block_tkg_nki_kernel_enabled": false, + "attn_cls": { + "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3", + "__name__": "NeuronQwen3Attention" + }, + "attn_kernel_enabled": null, + "attn_tkg_builtin_kernel_enabled": false, + "attn_tkg_nki_kernel_enabled": false, + "batch_size": 1, + "bucket_n_active_tokens": true, + "buckets": [ + 1024 + ], + "cast_type": "config", + "cc_pipeline_tiling_factor": 2, + "chunked_prefill_config": null, + "context_encoding_buckets": [ + 1024 + ], + "cp_degree": 1, + "ctx_batch_size": 1, + "disable_kv_cache_tiling": false, + "draft_model_modules_to_not_convert": null, + "enable_bucketing": true, + "enable_eagle_draft_input_norm": false, + "enable_eagle_speculation": false, + "enable_fused_speculation": false, + "enable_long_context_mode": false, + "enable_output_completion_notifications": false, + "enable_spill_reload_dge": false, + "enable_token_tree": false, + "ep_degree": 1, + "expert_mlp_nki_kernel_enabled": null, + "flash_decoding_enabled": false, + "fused_qkv": false, + "fused_rmsnorm_skip_gamma": false, + "is_block_kv_layout": null, + "is_chunked_prefill": false, + "is_continuous_batching": true, + "is_eagle_draft": false, + "is_medusa": false, + "is_prefill_stage": true, + "is_prefix_caching": false, + "k_cache_transposed": false, + "kv_cache_batch_size": 1, + "kv_cache_padding_size": 0, + "kv_cache_quant": false, + "kv_cache_tiling": false, + "layer_boundary_markers": false, + "lm_head_pad": false, + "lm_head_pad_alignment_size": 1, + "local_ranks_size": 2, + "logical_nc_config": 1, + "lora_config": null, + "max_batch_size": 1, + "max_context_length": 1024, + "max_length": 1024, + "max_new_tokens": null, + "medusa_speculation_length": 0, + "medusa_tree": null, + "mlp_kernel_enabled": false, + "mlp_kernel_fuse_residual_add": false, + "modules_to_not_convert": null, + "moe_fused_nki_kernel_enabled": null, + "n_active_tokens": 1024, + "n_positions": 1024, + "num_medusa_heads": 0, + "on_cpu": false, + "on_device_sampling_config": { + "deterministic": false, + "do_sample": false, + "dynamic": true, + "global_topk": 256, + "on_device_sampling_config": true, + "temperature": 1.0, + "top_k": 1, + "top_k_kernel_enabled": false, + "top_p": 1.0 + }, + "output_logits": false, + "overrides_torch_dtype": true, + "pa_block_size": 1024, + "pa_num_blocks": 1, + "padding_side": "right", + "pp_degree": 1, + "prefix_buckets": null, + "qk_layernorm": false, + "qkv_kernel_enabled": false, + "qkv_kernel_fuse_residual_add": false, + "qkv_kernel_nbsd_layout": false, + "quantization_dtype": "int8", + "quantization_type": "per_tensor_symmetric", + "quantize_clamp_bound": Infinity, + "quantized": false, + "quantized_checkpoints_path": null, + "quantized_mlp_kernel_enabled": false, + "rmsnorm_quantize_kernel_enabled": false, + "router_topk_nki_kernel_enabled": null, + "rpl_reduce_dtype": null, + "save_sharded_checkpoint": true, + "scratchpad_page_size": null, + "seq_len": 1024, + "seq_len_threshold_for_cc_tiling": 16384, + "sequence_parallel_enabled": false, + "shared_mlp_nki_kernel_enabled": null, + "skip_sharding": false, + "skip_warmup": false, + "spec_batch_size": 1, + "speculation_length": 0, + "start_rank_id": 0, + "target": null, + "tile_cc": false, + "tkg_batch_size": 1, + "token_generation_buckets": null, + "token_tree_config": null, + "torch_dtype": "bfloat16", + "tp_degree": 2, + "vocab_parallel": false, + "weight_gather_seq_len_threshold": 32768, + "weights_to_skip_layout_optimization": [], + "world_size": 2 + }, + "no_repeat_ngram_size": 0, + "num_attention_heads": 32, + "num_beam_groups": 1, + "num_beams": 1, + "num_cores_per_group": 1, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sep_token_id": null, + "sliding_window": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torchscript": false, + "transformers_version": "4.51.0", + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/layout_opt/command.txt b/layout_opt/command.txt new file mode 100644 index 0000000000000000000000000000000000000000..46d6fa42a12e872a9e04b4e312b9fc70e847859c --- /dev/null +++ b/layout_opt/command.txt @@ -0,0 +1 @@ +neuronx-cc compile graph.hlo --framework XLA --target trn1 --output graph.neff --model-type=transformer -O1 --lnc=1 '--internal-hlo2tensorizer-options=--experimental-unsafe-fp8e4m3fn-as-fp8e4m3 --verify-hlo=false' --logfile=log-neuron-cc.txt --verbose=35 \ No newline at end of file diff --git a/layout_opt/graph.neff b/layout_opt/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..f0d2ad1413c3a3e3990c7436f9721c80992e04ea --- /dev/null +++ b/layout_opt/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eafae43287bda2aa58740df223d211d8e3638af29e402c9cc6cbcadcf302ddde +size 5786624 diff --git a/layout_opt/log-neuron-cc.txt b/layout_opt/log-neuron-cc.txt new file mode 100644 index 0000000000000000000000000000000000000000..212469822a10fa960c521042dd41aa890ae833ec --- /dev/null +++ b/layout_opt/log-neuron-cc.txt @@ -0,0 +1,2066 @@ +2025-08-07T13:57:08Z INFO 50051 [root]: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/bin/neuronx-cc compile /home/ubuntu/qwen3/layout_opt/model/graph.hlo --framework XLA --target trn1 --output /home/ubuntu/qwen3/layout_opt/graph.neff --model-type=transformer -O1 --lnc=1 '--internal-hlo2tensorizer-options=--experimental-unsafe-fp8e4m3fn-as-fp8e4m3 --verify-hlo=false' --logfile=/home/ubuntu/qwen3/layout_opt/log-neuron-cc.txt --verbose=35 +2025-08-07T13:57:08Z INFO 50051 [root]: NeuronX Compiler version 2.20.9961.0+0acef03a Python version 3.10.12 HWM version 2.20.0.9961+0acef03a NumPy version 1.26.4 Running on AMI ami-040348201d80b58ad Running in region usw2-az4 +2025-08-07T13:57:08Z INFO 50115 [root]: XLA detected +2025-08-07T13:57:08Z INFO 50115 [root]: Pipeline: HLOToTensorizer Frontend StaticIOTranspose WalrusDriver BIRLinker Kelper NeffWrapper +2025-08-07T13:57:08Z INFO 50115 [root]: Intermediate files stored in /home/ubuntu/neuronxcc-6q5tifbo, output in /home/ubuntu +2025-08-07T13:57:08Z INFO 50115 [pipeline.Pipeline.0]: Job Pipeline len(in_states) 1 +2025-08-07T13:57:08Z INFO 50115 [pipeline.Pipeline.0]: Processing input #0 +2025-08-07T13:57:08Z INFO 50115 [pipeline.Pipeline.0]: Running pipeline Pipeline.0 +2025-08-07T13:57:08Z INFO 50115 [pipeline.Pipeline.0]: Starting job job.HLOToTensorizer.0 +2025-08-07T13:57:08Z INFO 50115 [job.HLOToTensorizer.0]: Job HLOToTensorizer len(in_states) 1 +2025-08-07T13:57:08Z INFO 50115 [job.HLOToTensorizer.0]: Processing input #0 +2025-08-07T13:57:08Z INFO 50115 [job.HLOToTensorizer.0]: IR signature: 12b45b028e502b2dd8c42c1287fbdbea434454143a30d473806853bc18673d98 for graph.hlo +2025-08-07T13:57:08Z INFO 50115 [job.HLOToTensorizer.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo2penguin --input /home/ubuntu/qwen3/layout_opt/model/graph.hlo --out-dir ./ --output penguin.py --remat --max-costly-ops=2 --max-live-in-size=5 --max-remat-chain-size=10 --max-mem-multiple=1.8 --min-def-use-distance=500 --remat-policy=transformer --allow-same-pass-remat=true --layers-per-module=1 --partition --emit-tensor-level-dropout-ops --experimental-unsafe-fp8e4m3fn-as-fp8e4m3 --verify-hlo=false --native-to-custom-softmax --partitioner-opts='--transformer' +2025-08-07T13:57:08Z INFO 50115 [job.HLOToTensorizer.0]: DEBUG: needsModular? No. macCnt 0 num non-trivial Ops 325 +INFO: Switching to single-module compile. PrePartitionPipe skipped. +INFO: Found memory bound graph +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 0 +Replaced 0 dropout sequences with OffloadedDropout +INFO: HloMacCount has found 0 +INFO: Traffic has found 8191043584 +INFO: AIF 0 +HLO Ops used in computation: parameter reshape transpose tuple +Warning: Could not open file debug_info_hlo_partitions.json +2025-08-07 13:57:08.744781: W hilo/hlo2penguin/utils/DumpDebugInfo.cc:52] Truncating long HLO operator name %last = tuple(%p76, %transpose.325, %transpose.326, %transpose.327, %p80, %transpose.328, %p82, %transpose.329, %transpose.330, %transpose.331, %transpose.332, %transpose.333, %transpose.334, %transpose.335, %transpose.336, %p91, %transpose.337, %p93, %transpose.338, %transpose.339, %transpose.340, %transpose.341, %transpose.342, %transpose.343, %transpose.344, %transpose.345, %p102, %transpose.346, %p104, %transpose.347, %transpose.348, %transpose.349, %transpose.350, %transpose.351, %transpose.352, %tr... to 512 characters in the compiler's debug metadata +Invoking RemoveOptimizationBarriers pass + +2025-08-07T13:57:08Z INFO 50115 [job.HLOToTensorizer.0]: IR signature: 5bb2cda84f89e3e556843403ea05d6d67130299dc9a1fbfc964c0d386a78e543 for sg0000/HLOToTensorizer +2025-08-07T13:57:08Z INFO 50115 [job.HLOToTensorizer.0]: Job #0 finished +2025-08-07T13:57:08Z INFO 50115 [pipeline.Pipeline.0]: Finished job job.HLOToTensorizer.0 +2025-08-07T13:57:08Z INFO 50115 [pipeline.Pipeline.0]: Starting job job.Frontend.0 +2025-08-07T13:57:08Z INFO 50115 [job.Frontend.0]: Job Frontend len(in_states) 1 +2025-08-07T13:57:08Z INFO 50115 [job.Frontend.0]: Processing input #0 +2025-08-07T13:57:08Z INFO 50115 [job.Frontend.0]: Start model loading +2025-08-07T13:57:08Z INFO 50115 [job.Frontend.0]: Start tensorization +2025-08-07T13:57:09Z INFO 50115 [job.Frontend.0]: Num jobs: 128 +2025-08-07T13:57:09Z USER 50115 [root/Tensorizer/Tensorizer]: Running Tensorizer +2025-08-07T13:57:09Z INFO 50115 [Tensorizer]: Frontend did not find netlist info. Switching to flat flow. +2025-08-07T13:57:09Z INFO 50115 [Tensorizer]: Building model from Penguin script "penguin.py"... +2025-08-07T13:57:09Z INFO 50115 [Tensorizer]: Tensorizer options: --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=1 --num-neuroncores-per-sengine=1 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=matmult-bf16 --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.004 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.006 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.037 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.014 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LowerTensorOp]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.005 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.003 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.038 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.049 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/TensorOpSimplifier]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.019 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/CanonicalizeIR]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.004 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.005 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.004 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.004 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.004 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.004 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.071 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.004 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.004 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.004 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.007 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.004 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.004 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.004 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.077 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LateLowerTensorOp]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.006 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.007 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.014 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/MemcpyElimination]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.002 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.003 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Rematerialization]: Running Rematerialization +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Rematerialization]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Rematerialization]: Rematerialization finished after 0.003 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.002 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.002 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.002 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.001 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LICM]: LICM finished after 0.001 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.003 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.001 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.001 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.001 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LICM]: LICM finished after 0.001 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/PadElimination]: Running PadElimination +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/PadElimination]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/PadElimination]: PadElimination finished after 0.001 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.002 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.003 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.001 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LICM]: LICM finished after 0.001 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.001 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.001 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.003 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.002 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Recompute]: Running Recompute +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Recompute]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-08-07T13:57:09Z INFO 50115 [Tensorizer]: After optimization: 325 statements +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/MutateDataType]: Running MutateDataType +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/MutateDataType]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/MutateDataType]: MutateDataType finished after 0.003 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/AutoCastTCInputs]: Running AutoCastTCInputs +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/AutoCastTCInputs]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/AutoCastTCInputs]: AutoCastTCInputs finished after 0.003 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.002 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.002 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.002 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.002 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.003 seconds +2025-08-07T13:57:09Z INFO 50115 [sg0000/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/InferIntrinsicOnCC]: Finished (changed=False) +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.036 seconds +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.005 seconds +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/LocalLayoutOpt]: Finished (changed=False) +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.008 seconds +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.003 seconds +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.002 seconds +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.021 seconds +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.006 seconds +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.034 seconds +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.022 seconds +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.042 seconds +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.005 seconds +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.056 seconds +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.003 seconds +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=False) +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.003 seconds +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/PGTiling]: Running PGTiling +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.029 seconds +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=False) +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.003 seconds +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/PComputeCutting]: Finished (changed=False) +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.008 seconds +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/BFComputeCutting]: Finished (changed=False) +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.004 seconds +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.001 seconds +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/MacroGeneration]: Finished (changed=False) +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.025 seconds +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/PGTiling]: PGTiling finished after 0.092 seconds +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.003 seconds +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.001 seconds +2025-08-07T13:57:10Z INFO 50115 [sg0000/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-08-07T13:57:28Z INFO 50115 [sg0000/Tensorizer/DramToDramTranspose]: Finished (changed=True) +2025-08-07T13:57:28Z INFO 50115 [sg0000/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 18.641 seconds +2025-08-07T13:57:28Z INFO 50115 [sg0000/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 18.886 seconds +2025-08-07T13:57:28Z INFO 50115 [sg0000/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-08-07T13:57:29Z INFO 50115 [sg0000/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:57:29Z INFO 50115 [sg0000/Tensorizer/TilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:29Z INFO 50115 [sg0000/Tensorizer/TilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:29Z INFO 50115 [sg0000/Tensorizer/TilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:29Z INFO 50115 [sg0000/Tensorizer/TilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:29Z INFO 50115 [sg0000/Tensorizer/TilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:29Z INFO 50115 [sg0000/Tensorizer/TilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:29Z INFO 50115 [sg0000/Tensorizer/TilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:29Z INFO 50115 [sg0000/Tensorizer/TilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:29Z INFO 50115 [sg0000/Tensorizer/TilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:29Z INFO 50115 [sg0000/Tensorizer/TilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:29Z INFO 50115 [sg0000/Tensorizer/TilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:29Z INFO 50115 [sg0000/Tensorizer/TilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:29Z INFO 50115 [sg0000/Tensorizer/TilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:29Z INFO 50115 [sg0000/Tensorizer/TilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:29Z INFO 50115 [sg0000/Tensorizer/TilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:29Z INFO 50115 [sg0000/Tensorizer/TilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:29Z INFO 50115 [sg0000/Tensorizer/TilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:29Z INFO 50115 [sg0000/Tensorizer/TilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:29Z INFO 50115 [sg0000/Tensorizer/TilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:29Z INFO 50115 [sg0000/Tensorizer/TilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:29Z INFO 50115 [sg0000/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-08-07T13:57:29Z INFO 50115 [sg0000/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.207 seconds +2025-08-07T13:57:29Z INFO 50115 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:57:29Z INFO 50115 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:57:29Z INFO 50115 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.121 seconds +2025-08-07T13:57:29Z INFO 50115 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.822 seconds +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.123 seconds +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/LICM]: LICM finished after 0.035 seconds +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.028 seconds +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.089 seconds +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.093 seconds +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.187 seconds +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: transpose_128x128 +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.034 seconds +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.129 seconds +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/LegalizeSundaMacro]: Finished (changed=False) +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.064 seconds +2025-08-07T13:57:30Z INFO 50115 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:57:31Z INFO 50115 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:57:31Z INFO 50115 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.130 seconds +2025-08-07T13:57:31Z INFO 50115 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:57:31Z INFO 50115 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:57:31Z INFO 50115 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.027 seconds +2025-08-07T13:57:31Z INFO 50115 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:57:31Z INFO 50115 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:57:31Z INFO 50115 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.097 seconds +2025-08-07T13:57:31Z INFO 50115 [sg0000/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-08-07T13:57:31Z INFO 50115 [sg0000/Tensorizer/RewriteWeights]: Finished (changed=False) +2025-08-07T13:57:31Z INFO 50115 [sg0000/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.022 seconds +2025-08-07T13:57:31Z INFO 50115 [sg0000/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-08-07T13:57:31Z INFO 50115 [sg0000/Tensorizer/ReshapeWeights]: Finished (changed=False) +2025-08-07T13:57:31Z INFO 50115 [sg0000/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.007 seconds +2025-08-07T13:57:31Z INFO 50115 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:57:31Z INFO 50115 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-08-07T13:57:31Z INFO 50115 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.081 seconds +2025-08-07T13:57:31Z INFO 50115 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:57:31Z INFO 50115 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-08-07T13:57:31Z INFO 50115 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.095 seconds +2025-08-07T13:57:31Z INFO 50115 [sg0000/Tensorizer/InferInitValue]: Running InferInitValue +2025-08-07T13:57:31Z INFO 50115 [sg0000/Tensorizer/InferInitValue]: Finished (changed=True) +2025-08-07T13:57:31Z INFO 50115 [sg0000/Tensorizer/InferInitValue]: InferInitValue finished after 0.428 seconds +2025-08-07T13:57:31Z INFO 50115 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:57:31Z INFO 50115 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:57:31Z INFO 50115 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.131 seconds +2025-08-07T13:57:31Z INFO 50115 [sg0000/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/SimplifyTensor]: Finished (changed=False) +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.082 seconds +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/LICM]: LICM finished after 0.036 seconds +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/SundaISel]: Running SundaISel +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/SundaISel]: Finished (changed=True) +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/SundaISel]: SundaISel finished after 0.545 seconds +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=True) +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.041 seconds +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.049 seconds +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.026 seconds +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.022 seconds +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.017 seconds +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.090 seconds +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.022 seconds +2025-08-07T13:57:32Z INFO 50115 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:57:33Z INFO 50115 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:57:33Z INFO 50115 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.083 seconds +2025-08-07T13:57:33Z INFO 50115 [sg0000/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:57:33Z INFO 50115 [sg0000/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-08-07T13:57:33Z INFO 50115 [sg0000/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.114 seconds +2025-08-07T13:57:33Z INFO 50115 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 1.587 seconds +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.045 seconds +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.020 seconds +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/VectorizeDMA]: Finished (changed=False) +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.030 seconds +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.011 seconds +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.010 seconds +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/DeConcat]: Running DeConcat +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/DeConcat]: Finished (changed=False) +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/DeConcat]: DeConcat finished after 0.001 seconds +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.020 seconds +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/PartialSimdFusion]: Finished (changed=False) +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.009 seconds +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/TritiumFusion]: Finished (changed=False) +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.010 seconds +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.081 seconds +2025-08-07T13:57:34Z INFO 50115 [sg0000/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-08-07T13:57:35Z INFO 50115 [sg0000/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-08-07T13:57:35Z INFO 50115 [sg0000/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.005 seconds +2025-08-07T13:57:35Z INFO 50115 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-08-07T13:57:35Z INFO 50115 [sg0000/Tensorizer/PartialLoopFusion]: Finished (changed=False) +2025-08-07T13:57:35Z INFO 50115 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.151 seconds +2025-08-07T13:57:35Z INFO 50115 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:57:35Z INFO 50115 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:57:35Z INFO 50115 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.047 seconds +2025-08-07T13:57:35Z INFO 50115 [sg0000/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:57:35Z INFO 50115 [sg0000/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-08-07T13:57:35Z INFO 50115 [sg0000/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.487 seconds +2025-08-07T13:57:35Z INFO 50115 [sg0000/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:57:35Z INFO 50115 [sg0000/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:57:35Z INFO 50115 [sg0000/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.018 seconds +2025-08-07T13:57:35Z INFO 50115 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:57:35Z INFO 50115 [sg0000/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-08-07T13:57:35Z INFO 50115 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.127 seconds +2025-08-07T13:57:35Z INFO 50115 [sg0000/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-08-07T13:57:35Z INFO 50115 [sg0000/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-08-07T13:57:35Z INFO 50115 [sg0000/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.015 seconds +2025-08-07T13:57:35Z INFO 50115 [sg0000/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/SpillPSum]: Finished (changed=False) +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/SpillPSum]: SpillPSum finished after 0.150 seconds +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.018 seconds +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.015 seconds +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/LegalizeType]: Finished (changed=True) +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/LegalizeType]: LegalizeType finished after 0.103 seconds +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.074 seconds +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.174 seconds +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.015 seconds +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.142 seconds +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.038 seconds +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.017 seconds +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.017 seconds +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.033 seconds +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.060 seconds +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.011 seconds +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/DataStreaming]: Finished (changed=False) +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/DataStreaming]: DataStreaming finished after 0.032 seconds +2025-08-07T13:57:36Z INFO 50115 [sg0000/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 3.204 seconds +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.066 seconds +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.018 seconds +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.017 seconds +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.465% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'20894.27130'[T_i0,T_i2_29578,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 128, 2, 3072) %'input8'[T_i0,i0.128,T_i2_29578,i1.3072] # id=25058, src_id=None, , instances=64 # dl = tensor_op_name: t2534_pftranspose_20894 | hlo_id: 1787 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.465% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'20935.27144'[T_i0,T_i2_29586,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 128, 2, 3072) %'input19'[T_i0,i0.128,T_i2_29586,i1.3072] # id=25116, src_id=None, , instances=64 # dl = tensor_op_name: t2597_pftranspose_20935 | hlo_id: 1805 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.465% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'20976.27158'[T_i0,T_i2_29594,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 128, 2, 3072) %'input30'[T_i0,i0.128,T_i2_29594,i1.3072] # id=25174, src_id=None, , instances=64 # dl = tensor_op_name: t2660_pftranspose_20976 | hlo_id: 1823 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.465% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'21017.27172'[T_i0,T_i2_29602,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 128, 2, 3072) %'input41'[T_i0,i0.128,T_i2_29602,i1.3072] # id=25232, src_id=None, , instances=64 # dl = tensor_op_name: t2723_pftranspose_21017 | hlo_id: 1841 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.465% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'21058.27186'[T_i0,T_i2_29610,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 128, 2, 3072) %'input52'[T_i0,i0.128,T_i2_29610,i1.3072] # id=25290, src_id=None, , instances=64 # dl = tensor_op_name: t2786_pftranspose_21058 | hlo_id: 1859 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.465% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'21099.27200'[T_i0,T_i2_29618,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 128, 2, 3072) %'input63'[T_i0,i0.128,T_i2_29618,i1.3072] # id=25348, src_id=None, , instances=64 # dl = tensor_op_name: t2849_pftranspose_21099 | hlo_id: 1877 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.465% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'21140.27214'[T_i0,T_i2_29626,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 128, 2, 3072) %'input74'[T_i0,i0.128,T_i2_29626,i1.3072] # id=25406, src_id=None, , instances=64 # dl = tensor_op_name: t2912_pftranspose_21140 | hlo_id: 1895 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.465% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'21181.27228'[T_i0,T_i2_29634,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 128, 2, 3072) %'input85'[T_i0,i0.128,T_i2_29634,i1.3072] # id=25464, src_id=None, , instances=64 # dl = tensor_op_name: t2975_pftranspose_21181 | hlo_id: 1913 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.465% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'21222.27242'[T_i0,T_i2_29642,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 128, 2, 3072) %'input96'[T_i0,i0.128,T_i2_29642,i1.3072] # id=25522, src_id=None, , instances=64 # dl = tensor_op_name: t3038_pftranspose_21222 | hlo_id: 1931 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.465% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'21263.27256'[T_i0,T_i2_29650,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 128, 2, 3072) %'input107'[T_i0,i0.128,T_i2_29650,i1.3072] # id=25580, src_id=None, , instances=64 # dl = tensor_op_name: t3101_pftranspose_21263 | hlo_id: 1949 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.033 seconds +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/OptimizeNKIKernels]: Finished (changed=False) +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.017 seconds +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.355 seconds +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-08-07T13:57:40Z WARNING 50115 [sg0000/Tensorizer/StaticProfiler]: matmul-based transposes inserted by penguin takes up 100.00 percent of all matmul computation +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.041 seconds +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.154 seconds +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.039 seconds +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.045 seconds +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.001 seconds +2025-08-07T13:57:40Z INFO 50115 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-08-07T13:57:41Z INFO 50115 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-08-07T13:57:41Z INFO 50115 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.659 seconds +2025-08-07T13:57:42Z INFO 50115 [Tensorizer]: BirCodeGen estimate #instances=279978 in sg0000 +2025-08-07T13:57:42Z INFO 50115 [Tensorizer]: IR signature: 4c500c33f6b410247d09546b05e57cdd552637593e5e9cae706f41ffd3eaadab for nc00/sg0000/TensorizerBIR +2025-08-07T13:57:42Z INFO 50115 [Tensorizer]: Weights total number of bytes: 131072 +2025-08-07T13:57:42Z INFO 50115 [Tensorizer]: Successfully built model. +2025-08-07T13:57:42Z USER 50115 [root/Tensorizer/Tensorizer]: Tensorizer finished after 33.117 seconds +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: End tensorization +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input0 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input1 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input2 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input3 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input4 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input5 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input6 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input7 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input8 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input9 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input10 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input11 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input12 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input13 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input14 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input15 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input16 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input17 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input18 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input19 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input20 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input21 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input22 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input23 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input24 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input25 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input26 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input27 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input28 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input29 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input30 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input31 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input32 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input33 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input34 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input35 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input36 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input37 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input38 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input39 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input40 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input41 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input42 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input43 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input44 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input45 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input46 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input47 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input48 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input49 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input50 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input51 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input52 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input53 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input54 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input55 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input56 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input57 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input58 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input59 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input60 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input61 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input62 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input63 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input64 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input65 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input66 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input67 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input68 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input69 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input70 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input71 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input72 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input73 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input74 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input75 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input76 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input77 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input78 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input79 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input80 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input81 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input82 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input83 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input84 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input85 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input86 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input87 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input88 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input89 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input90 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input91 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input92 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input93 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input94 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input95 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input96 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input97 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input98 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input99 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input100 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input101 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input102 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input103 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input104 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input105 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input106 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input107 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input108 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input109 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input110 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input111 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input112 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input113 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input114 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input115 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input116 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input117 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input118 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input119 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input120 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input121 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input122 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input123 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input124 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input125 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input126 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input127 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input128 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input129 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input130 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input131 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input132 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input133 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input134 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input135 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input136 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input137 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input138 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input139 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input140 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input141 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input142 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input143 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input144 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input145 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input146 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input147 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input148 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input149 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input150 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input151 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input152 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input153 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input154 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input155 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input156 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input157 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input158 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input159 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input160 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input161 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input162 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input163 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input164 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input165 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input166 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input167 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input168 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input169 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input170 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input171 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input172 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input173 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input174 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input175 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input176 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input177 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input178 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input179 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input180 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input181 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input182 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input183 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input184 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input185 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input186 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input187 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input188 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input189 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input190 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input191 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input192 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input193 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input194 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input195 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input196 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input197 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input198 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input199 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input200 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input201 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input202 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input203 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input204 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input205 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input206 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input207 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input208 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input209 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input210 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input211 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input212 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input213 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input214 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input215 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input216 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input217 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input218 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input219 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input220 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input221 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input222 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input223 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input224 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input225 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input226 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input227 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input228 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input229 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input230 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input231 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input232 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input233 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input234 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input235 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input236 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input237 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input238 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input239 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input240 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input241 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input242 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input243 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input244 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input245 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input246 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input247 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input248 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input249 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input250 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input251 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input252 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input253 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input254 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input255 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input256 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input257 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input258 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input259 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input260 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input261 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input262 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input263 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input264 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input265 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input266 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input267 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input268 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input269 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input270 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input271 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input272 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input273 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input274 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input275 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input276 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input277 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input278 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input279 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input280 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input281 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input282 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input283 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input284 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input285 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input286 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input287 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input288 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input289 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input290 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input291 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input292 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input293 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input294 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input295 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input296 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input297 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input298 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input299 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input300 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input301 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input302 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input303 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input304 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input305 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input306 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input307 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input308 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input309 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input310 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input311 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input312 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input313 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input314 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input315 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input316 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input317 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input318 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input319 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input320 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input321 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input322 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input323 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input324 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input325 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input326 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input327 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input328 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input329 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input330 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input331 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input332 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input333 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input334 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input335 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input336 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input337 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input338 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input339 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input340 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input341 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input342 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input343 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input344 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input345 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input346 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input347 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input348 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input349 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input350 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input351 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input352 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input353 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input354 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input355 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input356 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input357 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input358 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input359 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input360 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input361 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input362 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input363 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input364 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input365 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input366 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input367 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input368 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input369 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input370 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input371 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input372 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input373 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input374 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input375 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input376 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input377 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input378 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input379 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input380 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input381 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input382 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input383 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input384 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input385 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input386 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input387 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input388 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input389 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input390 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input391 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input392 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input393 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input394 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input395 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input396 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input397 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Network input: input398 +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: wrote bir.json +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: wrote tensor_map.json +2025-08-07T13:57:42Z INFO 50115 [job.Frontend.0]: Job #0 finished +2025-08-07T13:57:42Z INFO 50115 [pipeline.Pipeline.0]: Finished job job.Frontend.0 +2025-08-07T13:57:42Z INFO 50115 [pipeline.Pipeline.0]: Starting job job.StaticIOTranspose.0 +2025-08-07T13:57:42Z INFO 50115 [pipeline.Pipeline.0]: Finished job job.StaticIOTranspose.0 +2025-08-07T13:57:42Z INFO 50115 [pipeline.Pipeline.0]: Starting job job.WalrusDriver.0 +2025-08-07T13:57:42Z INFO 50115 [job.WalrusDriver.0]: BackendDriver has 1 states with 1 core LNC +2025-08-07T13:57:42Z INFO 50115 [job.WalrusDriver.0]: BackendDriver: no partitions found. Switching to flat flow. +2025-08-07T13:57:42Z INFO 50115 [job.WalrusDriver.0]: Job WalrusDriver len(in_states) 1 +2025-08-07T13:57:42Z INFO 50115 [job.WalrusDriver.0]: Processing input #0 +2025-08-07T13:57:42Z INFO 50115 [job.WalrusDriver.0]: BackendDriver in_state.num_states 1 with 1 core LNC +2025-08-07T13:57:42Z INFO 50115 [job.WalrusDriver.0]: Executing /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/walrus_driver --optlevel 2 --allocator coloring --verbose 35 --logfile-verbose 20 --logfile /home/ubuntu/qwen3/layout_opt/log-neuron-cc.txt --execute-repetition 1 -i bir.json --min_split_size 10240 --skip_split_vns '' --no_split_dram --split_huge_dram_tensor 1.0 --preprocessing_only --max_tensorizer_distance 64 --pack_same_shape_only --instruction_fetch_latency 511 --max-partitions 1 --policy 3 --auxflag 0 --interleave none --schedule-delayed-latency 1 --postsched-mm-accum-reorder=false --max-load-color-rotation --max-load-lower-bound 0.14 --mm-reorder-opt --force-prefetch-follow-incoming-order -1 --allreduce-buffer-size 500 --dram-page-size 512 --dram-rotation-size -1 --allreduce-rotation-dis 8 --repeat-load-thres 4 --enable-mm-transpose-remat-optimization=true --save-len-thres 512 --save-dma-cnt-thres 32 --relaxed-order=true --enable-anti-dependence-reduction=false --num-semaphores-per-queue 16 --numcores 1 --act-root-json /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/pwp/pwp_bin_trainium/act_info.json --dve-root-json /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen2/dve_info.json --unified-backend-and-legacy-codegen --tensor-map tensor_map.json --enable-verifier=true --enable-birsim=false --enable-birsim-sync-only=false --enable-data-race-checker=false --enable-new-backend=true --inject-error=NONE --dge-levels scalar_dynamic_offset,io,vector_dynamic_offsets --dynamic-dma-scratch-size-per-partition=16384 --neff-output-filename /home/ubuntu/qwen3/layout_opt/graph.neff +2025-08-07T13:57:42Z INFO 50115 [job.WalrusDriver.0]: Working directory is /home/ubuntu/neuronxcc-6q5tifbo/sg00 +2025-08-07T13:57:42Z INFO 50115 [job.WalrusDriver.0]: propagate_exit=True +2025-08-07T13:57:42Z INFO 50115 [job.WalrusDriver.0]: use_logger=False +2025-08-07T13:57:42Z INFO 50115 [job.WalrusDriver.0]: expose_stderr=True +2025-08-07T13:57:42Z INFO 50185 [Logging]: Logging to ../../qwen3/layout_opt/log-neuron-cc.txt at level 'INFO' +2025-08-07T13:57:42Z INFO 50185 [BackendDriver]: max_allowed_parallelism=128 +2025-08-07T13:57:42Z INFO 50185 [BackendDriver]: Backend driver mtBackend: false numModules: 1 Cwd: "/home/ubuntu/neuronxcc-6q5tifbo/sg00" +2025-08-07T13:57:42Z INFO 50185 [BackendDriver]: DynamicDMA is enabled +2025-08-07T13:57:42Z INFO 50185 [BackendDriver]: DynamicDMA levels being enabled: io, scalar_dynamic_offset, vector_dynamic_offsets, +2025-08-07T13:57:42Z USER 50185 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:57:42Z INFO 50185 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=1776 blocks=1 instructions=869 Max writers: 1 Max Readers: 325 +2025-08-07T13:57:42Z USER 50185 [ModuleForkPass]: Running do_nothing +2025-08-07T13:57:42Z INFO 50185 [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=1776 blocks=1 instructions=869 Max writers: 1 Max Readers: 325 +2025-08-07T13:57:42Z USER 50185 [ModuleForkPass]: do_nothing finished after 0.003 seconds +2025-08-07T13:57:42Z INFO 50185 [ModuleForkPass]: curr_vmrss: 176mb, ru_maxrss: 429mb (delta=0mb) +2025-08-07T13:57:42Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1776 memory location(s), 1 block(s), and 869 instruction(s). Max writers: 1 Max Readers: 325 +2025-08-07T13:57:42Z USER 50185 [ModuleForkPass]: Running birverifier +2025-08-07T13:57:42Z INFO 50185 [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=1776 blocks=1 instructions=869 Max writers: 1 Max Readers: 325 +2025-08-07T13:57:43Z USER 50185 [ModuleForkPass]: birverifier finished after 0.303 seconds +2025-08-07T13:57:43Z INFO 50185 [ModuleForkPass]: curr_vmrss: 944mb, ru_maxrss: 944mb (delta=515mb) +2025-08-07T13:57:43Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1776 memory location(s), 1 block(s), and 869 instruction(s). Max writers: 1 Max Readers: 325 +2025-08-07T13:57:43Z USER 50185 [BackendPassManager]: mod_parallel_pass finished after 0.315 seconds +2025-08-07T13:57:43Z INFO 50185 [BackendPassManager]: curr_vmrss: 944mb, ru_maxrss: 944mb (delta=515mb) +2025-08-07T13:57:43Z INFO 50185 [BackendPassManager]: Output has 1 module(s), 1 function(s), 1776 memory location(s), 1 block(s), and 869 instruction(s). Max writers: 1 Max Readers: 325 +2025-08-07T13:57:43Z USER 50185 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:57:43Z INFO 50185 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=1 allocs=1776 blocks=1 instructions=869 Max writers: 1 Max Readers: 325 +2025-08-07T13:57:43Z USER 50185 [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:57:43Z INFO 50185 [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=1776 blocks=1 instructions=869 Max writers: 1 Max Readers: 325 +2025-08-07T13:57:43Z USER 50185 [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-08-07T13:57:43Z INFO 50185 [SubgraphForkPass]: curr_vmrss: 944mb, ru_maxrss: 944mb (delta=0mb) +2025-08-07T13:57:43Z INFO 50185 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 1776 memory location(s), 1 block(s), and 869 instruction(s). Max writers: 1 Max Readers: 325 +2025-08-07T13:57:43Z USER 50185 [BackendPassManager]: subgraph_parallel_pass finished after 0.004 seconds +2025-08-07T13:57:43Z INFO 50185 [BackendPassManager]: curr_vmrss: 944mb, ru_maxrss: 944mb (delta=0mb) +2025-08-07T13:57:43Z INFO 50185 [BackendPassManager]: Output has 1 module(s), 1 function(s), 1776 memory location(s), 1 block(s), and 869 instruction(s). Max writers: 1 Max Readers: 325 +2025-08-07T13:57:43Z USER 50185 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:57:43Z INFO 50185 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=1776 blocks=1 instructions=869 Max writers: 1 Max Readers: 325 +2025-08-07T13:57:43Z USER 50185 [ModuleForkPass]: Running expand_replication +2025-08-07T13:57:43Z INFO 50185 [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=1776 blocks=1 instructions=869 Max writers: 1 Max Readers: 325 +2025-08-07T13:57:43Z INFO 50185 [ExpandReplication]: Found 0 replicated matmults +2025-08-07T13:57:43Z USER 50185 [ModuleForkPass]: expand_replication finished after 0.001 seconds +2025-08-07T13:57:43Z INFO 50185 [ModuleForkPass]: curr_vmrss: 944mb, ru_maxrss: 944mb (delta=0mb) +2025-08-07T13:57:43Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1776 memory location(s), 1 block(s), and 869 instruction(s). Max writers: 1 Max Readers: 325 +2025-08-07T13:57:43Z USER 50185 [ModuleForkPass]: Running unroll +2025-08-07T13:57:43Z INFO 50185 [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=1776 blocks=1 instructions=869 Max writers: 1 Max Readers: 325 +2025-08-07T13:57:43Z INFO 50185 [Unroll]: INFO (Unroll) Start unrolling at Thu Aug 7 13:57:43 2025 +2025-08-07T13:57:45Z INFO 50185 [Unroll]: INFO (Unroll) DONE unrolling Thu Aug 7 13:57:43 2025 + +2025-08-07T13:57:45Z INFO 50185 [Unroll]: sg0000 Instruction count after Unroll: +2025-08-07T13:57:45Z INFO 50185 [Unroll]: Total count: 279653 +2025-08-07T13:57:45Z INFO 50185 [Unroll]: Matmult: 212041 +2025-08-07T13:57:45Z INFO 50185 [Unroll]: GenericCopy: 53065 +2025-08-07T13:57:45Z INFO 50185 [Unroll]: Load: 7274 +2025-08-07T13:57:45Z INFO 50185 [Unroll]: Save: 7273 +2025-08-07T13:57:45Z INFO 50185 [Unroll]: Unrolled DGE count with Dynamic AP: 0 +2025-08-07T13:57:45Z USER 50185 [ModuleForkPass]: unroll finished after 2.777 seconds +2025-08-07T13:57:45Z INFO 50185 [ModuleForkPass]: curr_vmrss: 2492mb, ru_maxrss: 2492mb (delta=1548mb) +2025-08-07T13:57:45Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 69168 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:45Z USER 50185 [BackendPassManager]: mod_parallel_pass finished after 2.845 seconds +2025-08-07T13:57:45Z INFO 50185 [BackendPassManager]: curr_vmrss: 1645mb, ru_maxrss: 2492mb (delta=1548mb) +2025-08-07T13:57:45Z INFO 50185 [BackendPassManager]: Output has 1 module(s), 1 function(s), 69168 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:45Z USER 50185 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:57:45Z INFO 50185 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=1 allocs=69168 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:45Z USER 50185 [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:57:45Z INFO 50185 [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=69168 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:46Z INFO 50185 [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:57:46Z INFO 50185 [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:57:46Z INFO 50185 [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:57:46Z INFO 50185 [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:57:46Z USER 50185 [SubgraphForkPass]: dead_code_elim finished after 0.379 seconds +2025-08-07T13:57:46Z INFO 50185 [SubgraphForkPass]: curr_vmrss: 1668mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:57:46Z INFO 50185 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:46Z USER 50185 [BackendPassManager]: subgraph_parallel_pass finished after 0.395 seconds +2025-08-07T13:57:46Z INFO 50185 [BackendPassManager]: curr_vmrss: 1668mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:57:46Z INFO 50185 [BackendPassManager]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:46Z USER 50185 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:57:46Z INFO 50185 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:46Z USER 50185 [ModuleForkPass]: Running birverifier +2025-08-07T13:57:46Z INFO 50185 [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:46Z USER 50185 [ModuleForkPass]: birverifier finished after 0.313 seconds +2025-08-07T13:57:46Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1670mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:57:46Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:46Z USER 50185 [BackendPassManager]: mod_parallel_pass finished after 0.327 seconds +2025-08-07T13:57:46Z INFO 50185 [BackendPassManager]: curr_vmrss: 1670mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:57:46Z INFO 50185 [BackendPassManager]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:46Z USER 50185 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:57:46Z INFO 50185 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:46Z USER 50185 [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:57:46Z INFO 50185 [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:46Z USER 50185 [SubgraphForkPass]: lnc_verifier finished after 0.006 seconds +2025-08-07T13:57:46Z INFO 50185 [SubgraphForkPass]: curr_vmrss: 1670mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:57:46Z INFO 50185 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:46Z USER 50185 [BackendPassManager]: subgraph_parallel_pass finished after 0.018 seconds +2025-08-07T13:57:46Z INFO 50185 [BackendPassManager]: curr_vmrss: 1670mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:57:46Z INFO 50185 [BackendPassManager]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:46Z USER 50185 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:57:46Z INFO 50185 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:46Z USER 50185 [ModuleForkPass]: Running instruction_reorder +2025-08-07T13:57:46Z INFO 50185 [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:46Z USER 50185 [ModuleForkPass]: instruction_reorder finished after 0.050 seconds +2025-08-07T13:57:46Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1671mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:57:46Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:46Z USER 50185 [ModuleForkPass]: Running psum_legalization +2025-08-07T13:57:46Z INFO 50185 [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:46Z USER 50185 [ModuleForkPass]: psum_legalization finished after 0.034 seconds +2025-08-07T13:57:46Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1671mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:57:46Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:46Z USER 50185 [ModuleForkPass]: Running legalize_cce_dma +2025-08-07T13:57:46Z INFO 50185 [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:46Z USER 50185 [ModuleForkPass]: legalize_cce_dma finished after 0.031 seconds +2025-08-07T13:57:46Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1671mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:57:46Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:46Z USER 50185 [ModuleForkPass]: Running error_injector +2025-08-07T13:57:46Z INFO 50185 [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:46Z WARNING 50185 [ErrorInjector]: Unrecognized injected error value "0" +2025-08-07T13:57:46Z USER 50185 [ModuleForkPass]: error_injector finished after 0.006 seconds +2025-08-07T13:57:46Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1671mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:57:46Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:46Z USER 50185 [ModuleForkPass]: Running vn_splitter +2025-08-07T13:57:46Z INFO 50185 [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:46Z INFO 50185 [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 0 +2025-08-07T13:57:46Z INFO 50185 [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-08-07T13:57:47Z INFO 50185 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-08-07T13:57:47Z INFO 50185 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-08-07T13:57:47Z INFO 50185 [VNSplitterPass]: INFO (VNSplitter) Time: 0.006 seconds +2025-08-07T13:57:47Z INFO 50185 [VNSplitterPass]: INFO (VerticalFusion) Time: 0.077 seconds +2025-08-07T13:57:47Z INFO 50185 [VNSplitterPass]: INFO (ShrinkDN) Time: 0.083 seconds +2025-08-07T13:57:47Z USER 50185 [ModuleForkPass]: vn_splitter finished after 0.226 seconds +2025-08-07T13:57:47Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1681mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:57:47Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:47Z USER 50185 [ModuleForkPass]: Running constant_propagate +2025-08-07T13:57:47Z INFO 50185 [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:47Z INFO 50185 [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-08-07T13:57:47Z INFO 50185 [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:57:47Z INFO 50185 [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:57:47Z INFO 50185 [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:57:47Z INFO 50185 [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:57:47Z INFO 50185 [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-08-07T13:57:47Z INFO 50185 [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:57:47Z INFO 50185 [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:57:47Z INFO 50185 [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:57:47Z INFO 50185 [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:57:47Z USER 50185 [ModuleForkPass]: constant_propagate finished after 0.724 seconds +2025-08-07T13:57:47Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1684mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:57:47Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:47Z USER 50185 [ModuleForkPass]: Running lower_ac +2025-08-07T13:57:47Z INFO 50185 [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:47Z INFO 50185 [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-08-07T13:57:47Z USER 50185 [ModuleForkPass]: lower_ac finished after 0.046 seconds +2025-08-07T13:57:47Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1684mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:57:47Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:47Z USER 50185 [ModuleForkPass]: Running input_dma_coalescing +2025-08-07T13:57:47Z INFO 50185 [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:47Z INFO 50185 [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-08-07T13:57:47Z USER 50185 [ModuleForkPass]: input_dma_coalescing finished after 0.109 seconds +2025-08-07T13:57:47Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1684mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:57:47Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:47Z USER 50185 [ModuleForkPass]: Running remat_optimization +2025-08-07T13:57:47Z INFO 50185 [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:48Z INFO 50185 [RematOpt]: Removed 0 remat instructions +2025-08-07T13:57:48Z USER 50185 [ModuleForkPass]: remat_optimization finished after 0.189 seconds +2025-08-07T13:57:48Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1686mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:57:48Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:48Z USER 50185 [ModuleForkPass]: Running early_peephole_opts +2025-08-07T13:57:48Z INFO 50185 [ModuleForkPass]: Inputs to early_peephole_opts: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:48Z INFO 50185 [EarlyPeepholeOpts]: PeepholeOpts enabled? ActivationAccumulate: true +2025-08-07T13:57:48Z INFO 50185 [EarlyPeepholeOpts]: Activation Accumulate: 0 +2025-08-07T13:57:48Z USER 50185 [ModuleForkPass]: early_peephole_opts finished after 0.090 seconds +2025-08-07T13:57:48Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1686mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:57:48Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:48Z USER 50185 [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-08-07T13:57:48Z INFO 50185 [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:48Z USER 50185 [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.026 seconds +2025-08-07T13:57:48Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1686mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:57:48Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:48Z USER 50185 [ModuleForkPass]: Running infer_stream_ids +2025-08-07T13:57:48Z INFO 50185 [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:48Z USER 50185 [ModuleForkPass]: infer_stream_ids finished after 0.026 seconds +2025-08-07T13:57:48Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1686mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:57:48Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:48Z USER 50185 [ModuleForkPass]: Running pre_sched +2025-08-07T13:57:48Z INFO 50185 [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:48Z INFO 50185 [PreSched]: Start PRE scheduling 2 cores: 1 at: Thu Aug 7 13:57:48 2025 +2025-08-07T13:57:48Z INFO 50185 [LayerSpiller]: LayerSpill: Start... +2025-08-07T13:57:48Z INFO 50185 [LayerSpiller]: LayerSpill: Found 0 Splits CCs +2025-08-07T13:57:48Z INFO 50185 [LayerSpiller]: Grouped CCs to 0 clusters. +2025-08-07T13:57:48Z INFO 50185 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-08-07T13:57:48Z INFO 50185 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-08-07T13:57:48Z INFO 50185 [LayerSpiller]: LayerSpill: Done. +2025-08-07T13:57:48Z INFO 50185 [PreSched]: Start split live ranges Thu Aug 7 13:57:48 2025 +2025-08-07T13:57:48Z INFO 50185 [PreSched]: Num_Splits: 0 +2025-08-07T13:57:48Z INFO 50185 [PreSched]: End split live ranges Thu Aug 7 13:57:48 2025 +2025-08-07T13:57:48Z INFO 50185 [PreSched]: Strt remove redundncies Thu Aug 7 13:57:48 2025 +2025-08-07T13:57:48Z INFO 50185 [PreSched]: remove_redundant_memsets +2025-08-07T13:57:48Z INFO 50185 [PreSched]: remove_redundant_memsets: 0 +2025-08-07T13:57:48Z INFO 50185 [PreSched]: remove_redundant_loads +2025-08-07T13:57:48Z INFO 50185 [PreSched]: remove_redundant_loads: 0 +2025-08-07T13:57:48Z INFO 50185 [PreSched]: End remove redundncies Thu Aug 7 13:57:48 2025 +2025-08-07T13:57:48Z INFO 50185 [PreSched]: Start DCE Thu Aug 7 13:57:48 2025 +2025-08-07T13:57:48Z INFO 50185 [PreSched]: eliminateDeadStore removed 0 instructions +2025-08-07T13:57:48Z INFO 50185 [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:57:48Z INFO 50185 [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:57:48Z INFO 50185 [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:57:48Z INFO 50185 [PreSched]: End DCE Thu Aug 7 13:57:48 2025 +2025-08-07T13:57:49Z INFO 50185 [PreSched]: Start build flow dependencies Thu Aug 7 13:57:49 2025 +2025-08-07T13:57:49Z INFO 50185 [build_flow_deps]: Start build fdeps. Invocation: 1Thu Aug 7 13:57:49 2025 +2025-08-07T13:57:49Z INFO 50185 [build_flow_deps]: Allocs: 68412 instructions: 279653 +2025-08-07T13:57:50Z INFO 50185 [build_flow_deps]: Build fdeps inserted 698765 edges +2025-08-07T13:57:50Z INFO 50185 [build_flow_deps]: Done build fdeps 698765 Thu Aug 7 13:57:50 2025 +2025-08-07T13:57:50Z INFO 50185 [PreSched]: End build flow dependencies Thu Aug 7 13:57:50 2025 +2025-08-07T13:57:50Z INFO 50185 [PreSched]: Start remove useless insts Thu Aug 7 13:57:50 2025 +2025-08-07T13:57:50Z INFO 50185 [PreSched]: remove_useless_insts +2025-08-07T13:57:50Z INFO 50185 [PreSched]: remove Useless Instructions: 0 +2025-08-07T13:57:50Z INFO 50185 [PreSched]: End remove useless insts Thu Aug 7 13:57:50 2025 +2025-08-07T13:57:50Z INFO 50185 [PreSched]: Start scratchpad optimization Thu Aug 7 13:57:50 2025 +2025-08-07T13:57:50Z INFO 50185 [PreSched]: End scratchpad optimization Thu Aug 7 13:57:50 2025 +2025-08-07T13:57:50Z INFO 50185 [PreSched]: DONE PRE scheduling Thu Aug 7 13:57:50 2025 +2025-08-07T13:57:50Z USER 50185 [ModuleForkPass]: pre_sched finished after 2.285 seconds +2025-08-07T13:57:50Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1816mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:57:50Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:50Z USER 50185 [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:57:50Z INFO 50185 [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:50Z INFO 50185 [TensorCopyElim]: Tensor CP elimination: 0 +2025-08-07T13:57:50Z INFO 50185 [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:57:50Z INFO 50185 [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:57:50Z INFO 50185 [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:57:51Z INFO 50185 [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:57:51Z USER 50185 [ModuleForkPass]: tensor_copy_elim finished after 0.450 seconds +2025-08-07T13:57:51Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1816mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:57:51Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:51Z USER 50185 [ModuleForkPass]: Running dynamic_dma_setup +2025-08-07T13:57:51Z INFO 50185 [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:51Z USER 50185 [ModuleForkPass]: dynamic_dma_setup finished after 0.006 seconds +2025-08-07T13:57:51Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1816mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:57:51Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68413 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:51Z USER 50185 [ModuleForkPass]: Running runtime_memory_reservation +2025-08-07T13:57:51Z INFO 50185 [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=68413 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:51Z USER 50185 [ModuleForkPass]: runtime_memory_reservation finished after 0.006 seconds +2025-08-07T13:57:51Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1816mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:57:51Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68413 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:51Z USER 50185 [ModuleForkPass]: Running coloring_allocator_psum +2025-08-07T13:57:51Z INFO 50185 [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=68413 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:51Z INFO 50185 [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:57:51Z INFO 50185 [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:57:51Z INFO 50185 [PSUM_Allocator]: allocating PSUM +2025-08-07T13:57:51Z INFO 50185 [PSUM_Allocator]: main loop +2025-08-07T13:57:51Z INFO 50185 [PSUM_Allocator]: renumber locations +2025-08-07T13:57:51Z INFO 50185 [PSUM_Allocator]: size = 53065 +2025-08-07T13:57:51Z INFO 50185 [PSUM_Allocator]: build_no_bitmap start +2025-08-07T13:57:51Z INFO 50185 [PSUM_Allocator]: 100% PSUM demand before spilling +2025-08-07T13:57:51Z INFO 50185 [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-08-07T13:57:51Z INFO 50185 [PSUM_Allocator]: found 171648 edges +2025-08-07T13:57:51Z INFO 50185 [PSUM_Allocator]: mean: 6.46935 +2025-08-07T13:57:51Z INFO 50185 [PSUM_Allocator]: median: 6.99995 +2025-08-07T13:57:51Z INFO 50185 [PSUM_Allocator]: adjacency vectors require 1373184 bytes +2025-08-07T13:57:51Z INFO 50185 [PSUM_Allocator]: build_no_bitmap done +2025-08-07T13:57:51Z INFO 50185 [PSUM_Allocator]: find costs +2025-08-07T13:57:51Z INFO 50185 [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-08-07T13:57:51Z INFO 50185 [PSUM_Allocator]: simplify interference graph +2025-08-07T13:57:51Z INFO 50185 [PSUM_Allocator]: initialize low and high +2025-08-07T13:57:51Z INFO 50185 [PSUM_Allocator]: lo = 53065 +2025-08-07T13:57:51Z INFO 50185 [PSUM_Allocator]: hi = 0 +2025-08-07T13:57:51Z INFO 50185 [PSUM_Allocator]: inf = 0 +2025-08-07T13:57:51Z INFO 50185 [PSUM_Allocator]: total = 53065 +2025-08-07T13:57:51Z INFO 50185 [PSUM_Allocator]: simplify +2025-08-07T13:57:51Z INFO 50185 [PSUM_Allocator]: new candidates = 0 +2025-08-07T13:57:51Z INFO 50185 [PSUM_Allocator]: select ranges +2025-08-07T13:57:51Z INFO 50185 [PSUM_Allocator]: no more spills +2025-08-07T13:57:51Z INFO 50185 [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-08-07T13:57:51Z INFO 50185 [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-08-07T13:57:51Z INFO 50185 [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-08-07T13:57:51Z USER 50185 [ModuleForkPass]: coloring_allocator_psum finished after 0.662 seconds +2025-08-07T13:57:51Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1832mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:57:51Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68413 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:51Z USER 50185 [ModuleForkPass]: Running dma_optimization_psum +2025-08-07T13:57:51Z INFO 50185 [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=68413 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:51Z INFO 50185 [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-08-07T13:57:51Z INFO 50185 [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-08-07T13:57:51Z USER 50185 [ModuleForkPass]: dma_optimization_psum finished after 0.234 seconds +2025-08-07T13:57:51Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1832mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:57:51Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68413 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:51Z USER 50185 [ModuleForkPass]: Running address_rotation_psum +2025-08-07T13:57:51Z INFO 50185 [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=68413 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:52Z INFO 50185 [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2025-08-07T13:57:53Z INFO 50185 [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2025-08-07T13:57:53Z INFO 50185 [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2025-08-07T13:57:53Z USER 50185 [ModuleForkPass]: address_rotation_psum finished after 1.951 seconds +2025-08-07T13:57:53Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1835mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:57:53Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68413 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:53Z USER 50185 [ModuleForkPass]: Running coloring_allocator_sb +2025-08-07T13:57:53Z INFO 50185 [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=68413 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:54Z INFO 50185 [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 6946398208 +2025-08-07T13:57:54Z INFO 50185 [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 7517 bytes +2025-08-07T13:57:54Z INFO 50185 [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 6946365440 +2025-08-07T13:57:54Z INFO 50185 [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 7461 bytes +2025-08-07T13:57:54Z INFO 50185 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 0 +2025-08-07T13:57:54Z INFO 50185 [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 0 bytes +2025-08-07T13:57:54Z INFO 50185 [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:57:54Z INFO 50185 [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: allocating SB +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: main loop +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: renumber locations +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: size = 14548 +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: find partners +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: found 53065 accumulation groups +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: largest = 22342.27111_i383 +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: tensors = 2 +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: requires 8448 bytes/partition +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: expanding partners +2025-08-07T13:57:54Z INFO 50185 []: find first defs for local +2025-08-07T13:57:54Z INFO 50185 []: find first defs for global +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: find loads +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: 1 pin count +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: 6121 remat count +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: build interference graph +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: pass 1 int-tree +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: Num intervals 14548 Num locations 14548 +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: IntervalTree Build Done +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: info.neighbors init Done +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: info.neighbors partners Done +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: IntervalTree readback Done +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: edge: 32260 +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: mean: 4.43497 +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: median: 2.00048 +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: find costs +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: simplify interference graph +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: initialize safe & unsafe +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: safe = 14546 +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: unsafe = 1 +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: inf = 0 +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: total = 14547 +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: simplify +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 14548 +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: new candidates = 0 +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: select ranges +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: Total: 14547 +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: Spilled: 0.000 (0) +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: Allocated: 1.000 (14547) +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: Rover zone: 0.988 (14367) +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: Pre-rover zone: 0.010 (144) +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: Post-rover zone: 0.002 (36) +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: Slice zone: 0.000 (0) +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: Blocks medium: 0.000 (0) +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: Blocks tall: 1.000 (14547) +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: Visited until tall blocking (mean): 0.996 +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-08-07T13:57:54Z INFO 50185 [SB_Allocator]: Success +2025-08-07T13:57:55Z INFO 50185 [SB_Allocator]: SB spills = 0 tensors +2025-08-07T13:57:55Z INFO 50185 [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:57:55Z INFO 50185 [SB_Allocator]: remats = 0 tensors +2025-08-07T13:57:55Z INFO 50185 [SB_Allocator]: unpinned = 0 tensors +2025-08-07T13:57:55Z INFO 50185 [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:57:55Z INFO 50185 [SB_Allocator]: SB score = 0 +2025-08-07T13:57:55Z INFO 50185 [SB_Allocator]: spilling from SB cost about 0 cycles +2025-08-07T13:57:55Z INFO 50185 [SB_Allocator]: 16384 bytes/partition (100%) successfully pinned +2025-08-07T13:57:55Z INFO 50185 [SB_Allocator]: pinning saved approximately 9010 cycles +2025-08-07T13:57:55Z INFO 50185 [SB_Allocator]: 0% SB utilization after allocation +2025-08-07T13:57:55Z INFO 50185 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 6946398208 +2025-08-07T13:57:55Z INFO 50185 [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 7517 bytes +2025-08-07T13:57:55Z INFO 50185 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 6946365440 +2025-08-07T13:57:55Z INFO 50185 [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 7461 bytes +2025-08-07T13:57:55Z INFO 50185 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 0 +2025-08-07T13:57:55Z INFO 50185 [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 0 bytes +2025-08-07T13:57:55Z USER 50185 [ModuleForkPass]: coloring_allocator_sb finished after 1.175 seconds +2025-08-07T13:57:55Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1840mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:57:55Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68413 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:55Z USER 50185 [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:57:55Z INFO 50185 [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=68413 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:55Z INFO 50185 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:57:55Z USER 50185 [ModuleForkPass]: address_rotation_sb finished after 0.333 seconds +2025-08-07T13:57:55Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1842mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:57:55Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68413 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:55Z USER 50185 [ModuleForkPass]: Running dma_optimization_sb +2025-08-07T13:57:55Z INFO 50185 [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=68413 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:55Z INFO 50185 [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 13892763648, 50.0001% input load, 49.9999% output write, 0% spill/reload [sg0000] +2025-08-07T13:57:55Z INFO 50185 [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:57:55Z INFO 50185 [DMAOptimizationBase]: removed 0 identical load +2025-08-07T13:57:55Z INFO 50185 [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:57:55Z INFO 50185 [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:57:55Z INFO 50185 [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-08-07T13:57:55Z INFO 50185 [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-08-07T13:57:56Z INFO 50185 [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-08-07T13:57:56Z INFO 50185 [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(6.9464e+09) +2025-08-07T13:57:56Z INFO 50185 [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-08-07T13:57:56Z INFO 50185 [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-08-07T13:57:56Z INFO 50185 [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, -nan% out of total spill/reload dma traffic +2025-08-07T13:57:56Z INFO 50185 [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-08-07T13:57:56Z INFO 50185 [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-08-07T13:57:56Z INFO 50185 [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, -nan% out of total spill/reload dma traffic +2025-08-07T13:57:56Z INFO 50185 [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-08-07T13:57:56Z INFO 50185 [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-08-07T13:57:56Z INFO 50185 [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, -nan% out of total spill/reload dma traffic +2025-08-07T13:57:56Z INFO 50185 [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-08-07T13:57:56Z INFO 50185 [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-08-07T13:57:56Z INFO 50185 [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-08-07T13:57:56Z INFO 50185 [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-08-07T13:57:57Z INFO 50185 [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-08-07T13:57:57Z INFO 50185 [DMAOptimizationBase]: average loaded DMA size 7517 bytes +2025-08-07T13:57:57Z INFO 50185 [DMAOptimizationBase]: average saved DMA size 7461 bytes +2025-08-07T13:57:57Z INFO 50185 [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 6946398208 +2025-08-07T13:57:57Z INFO 50185 [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 7517 bytes +2025-08-07T13:57:57Z INFO 50185 [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 6946365440 +2025-08-07T13:57:57Z INFO 50185 [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 7461 bytes +2025-08-07T13:57:57Z INFO 50185 [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:57:57Z INFO 50185 [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, -nan% out of total spill/reload dma traffic +2025-08-07T13:57:57Z INFO 50185 [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 0, 0% out of total dma traffic +2025-08-07T13:57:57Z INFO 50185 [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 13892763648, 50.0001% input load, 49.9999% output write, 0% spill/reload [sg0000] +2025-08-07T13:57:57Z INFO 50185 [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 6946398208 +2025-08-07T13:57:57Z INFO 50185 [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 7517 bytes +2025-08-07T13:57:57Z INFO 50185 [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 6946365440 +2025-08-07T13:57:57Z INFO 50185 [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 7461 bytes +2025-08-07T13:57:57Z INFO 50185 [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 0 +2025-08-07T13:57:57Z INFO 50185 [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 0 bytes +2025-08-07T13:57:57Z INFO 50185 [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 7488 bytes +2025-08-07T13:57:57Z INFO 50185 [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-08-07T13:57:57Z INFO 50185 [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-08-07T13:57:57Z USER 50185 [ModuleForkPass]: dma_optimization_sb finished after 2.090 seconds +2025-08-07T13:57:57Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1862mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:57:57Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:57Z USER 50185 [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:57:57Z INFO 50185 [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:57Z INFO 50185 [DMAOptimizationBase]: SB Rotation rotated 5962 Sb address +2025-08-07T13:57:58Z INFO 50185 [DMAOptimizationBase]: SB Rotation rotated 4811 Sb address +2025-08-07T13:57:58Z INFO 50185 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:57:58Z INFO 50185 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:57:59Z INFO 50185 [DMAOptimizationBase]: SB Rotation rotated 2052 Sb address +2025-08-07T13:57:59Z INFO 50185 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:57:59Z USER 50185 [ModuleForkPass]: address_rotation_sb finished after 1.914 seconds +2025-08-07T13:57:59Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1862mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:57:59Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:59Z USER 50185 [ModuleForkPass]: Running coloring_allocator_dram +2025-08-07T13:57:59Z INFO 50185 [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:59Z INFO 50185 [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:57:59Z INFO 50185 [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:57:59Z INFO 50185 [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-08-07T13:57:59Z INFO 50185 [DRAM_Allocator]: reserved space = 16382119936 bytes +2025-08-07T13:57:59Z INFO 50185 [DRAM_Allocator]: spill space = 0 bytes +2025-08-07T13:57:59Z INFO 50185 [DRAM_Allocator]: aligned spill space = 0 bytes +2025-08-07T13:57:59Z INFO 50185 [DRAM_Allocator]: dram space = 107374182400 bytes +2025-08-07T13:57:59Z INFO 50185 [DRAM_Allocator]: renumber locations +2025-08-07T13:57:59Z INFO 50185 [DRAM_Allocator]: size = 0 +2025-08-07T13:57:59Z INFO 50185 []: find first defs for local +2025-08-07T13:57:59Z INFO 50185 []: find first defs for global +2025-08-07T13:57:59Z INFO 50185 [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-08-07T13:57:59Z INFO 50185 [DRAM_Allocator]: IntervalTree Build Done +2025-08-07T13:57:59Z INFO 50185 [DRAM_Allocator]: info.neighbors init Done +2025-08-07T13:57:59Z INFO 50185 [DRAM_Allocator]: IntervalTree readback Done +2025-08-07T13:57:59Z INFO 50185 [DRAM_Allocator]: simplify interference graph +2025-08-07T13:57:59Z INFO 50185 [DRAM_Allocator]: initialize low and high +2025-08-07T13:57:59Z INFO 50185 [DRAM_Allocator]: lo = 0 +2025-08-07T13:57:59Z INFO 50185 [DRAM_Allocator]: hi = 0 +2025-08-07T13:57:59Z INFO 50185 [DRAM_Allocator]: total = 0 +2025-08-07T13:57:59Z INFO 50185 [DRAM_Allocator]: simplify +2025-08-07T13:57:59Z INFO 50185 [DRAM_Allocator]: new candidates = 0 +2025-08-07T13:57:59Z INFO 50185 [DRAM_Allocator]: select ranges +2025-08-07T13:57:59Z INFO 50185 [DRAM_Allocator]: CC buffer size limit 524288000 +2025-08-07T13:57:59Z INFO 50185 [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-08-07T13:57:59Z INFO 50185 [DRAM_Allocator]: Real CC buffer size 0 +2025-08-07T13:57:59Z INFO 50185 [DRAM_Allocator]: DRAM hwm after allocation: 0 +2025-08-07T13:57:59Z INFO 50185 [DRAM_Allocator]: DRAM allocation successful +2025-08-07T13:57:59Z USER 50185 [ModuleForkPass]: coloring_allocator_dram finished after 0.452 seconds +2025-08-07T13:57:59Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1863mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:57:59Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:59Z USER 50185 [ModuleForkPass]: Running address_rotation_dram +2025-08-07T13:57:59Z INFO 50185 [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:57:59Z INFO 50185 [DMAOptimizationBase]: Runtime page size at 512MB +2025-08-07T13:58:00Z INFO 50185 [DMAOptimizationBase]: DRAM hwm before rotation 0 +2025-08-07T13:58:00Z INFO 50185 [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-08-07T13:58:00Z INFO 50185 [DMAOptimizationBase]: allreduce hwm 0 +2025-08-07T13:58:00Z INFO 50185 [DMAOptimizationBase]: Real CC buffer size 0 +2025-08-07T13:58:00Z INFO 50185 [DMAOptimizationBase]: DRAM hwm after rotation 0 +2025-08-07T13:58:00Z INFO 50185 [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-08-07T13:58:00Z USER 50185 [ModuleForkPass]: address_rotation_dram finished after 0.223 seconds +2025-08-07T13:58:00Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1865mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:00Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:00Z USER 50185 [ModuleForkPass]: Running tensorcopy_accel +2025-08-07T13:58:00Z INFO 50185 [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:00Z INFO 50185 [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-08-07T13:58:00Z INFO 50185 [TensorCopyAccel::Impl]: Accelerated 0 out of 53065 tensorcopy in Function: sg0000 average acceleration factor: -nan +2025-08-07T13:58:00Z USER 50185 [ModuleForkPass]: tensorcopy_accel finished after 0.035 seconds +2025-08-07T13:58:00Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1865mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:00Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:00Z USER 50185 [ModuleForkPass]: Running peephole_opts +2025-08-07T13:58:00Z INFO 50185 [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:00Z INFO 50185 [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-08-07T13:58:00Z USER 50185 [ModuleForkPass]: peephole_opts finished after 0.100 seconds +2025-08-07T13:58:00Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1865mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:00Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:00Z USER 50185 [ModuleForkPass]: Running lower_kernel +2025-08-07T13:58:00Z INFO 50185 [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:00Z INFO 50185 [LowerKernel]: Started running LowerKernel +2025-08-07T13:58:00Z INFO 50185 [LowerKernel]: Start of kernel lowering pass, number of insts: 279653, number of allocs: 68412 +2025-08-07T13:58:00Z INFO 50185 [LowerKernel]: Scan BKs time (s): 0.020931 +2025-08-07T13:58:00Z INFO 50185 [LowerKernel]: Lower BKs time (s): 1e-05 +2025-08-07T13:58:00Z USER 50185 [ModuleForkPass]: lower_kernel finished after 0.029 seconds +2025-08-07T13:58:00Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1865mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:00Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:00Z USER 50185 [ModuleForkPass]: Running lower_nki_kernel +2025-08-07T13:58:00Z INFO 50185 [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:00Z USER 50185 [ModuleForkPass]: lower_nki_kernel finished after 0.026 seconds +2025-08-07T13:58:00Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1865mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:00Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:00Z USER 50185 [ModuleForkPass]: Running dynamic_dma_cleanup +2025-08-07T13:58:00Z INFO 50185 [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:00Z USER 50185 [ModuleForkPass]: dynamic_dma_cleanup finished after 0.042 seconds +2025-08-07T13:58:00Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1867mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:00Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:00Z USER 50185 [ModuleForkPass]: Running birverifier +2025-08-07T13:58:00Z INFO 50185 [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:00Z USER 50185 [ModuleForkPass]: birverifier finished after 0.258 seconds +2025-08-07T13:58:00Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1867mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:00Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:00Z USER 50185 [ModuleForkPass]: Running dynamic_dma_scan +2025-08-07T13:58:00Z INFO 50185 [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:00Z USER 50185 [ModuleForkPass]: dynamic_dma_scan finished after 0.042 seconds +2025-08-07T13:58:00Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1867mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:00Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:00Z USER 50185 [ModuleForkPass]: Running build_fdeps +2025-08-07T13:58:00Z INFO 50185 [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:00Z INFO 50185 [build_flow_deps]: Start build fdeps. Invocation: 2Thu Aug 7 13:58:00 2025 +2025-08-07T13:58:00Z INFO 50185 [build_flow_deps]: Allocs: 68412 instructions: 279653 +2025-08-07T13:58:01Z INFO 50185 [build_flow_deps]: Build fdeps inserted 698765 edges +2025-08-07T13:58:01Z INFO 50185 [build_flow_deps]: Done build fdeps 698765 Thu Aug 7 13:58:01 2025 +2025-08-07T13:58:01Z USER 50185 [ModuleForkPass]: build_fdeps finished after 1.147 seconds +2025-08-07T13:58:01Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1892mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:01Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:01Z USER 50185 [ModuleForkPass]: Running remove_redundancies +2025-08-07T13:58:01Z INFO 50185 [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:01Z INFO 50185 [RemoveRedundancies]: remove_clobbered_writes +2025-08-07T13:58:01Z INFO 50185 [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-08-07T13:58:01Z INFO 50185 [RemoveRedundancies]: remove_useless_insts +2025-08-07T13:58:02Z INFO 50185 [RemoveRedundancies]: remove Useless Instructions: 0 +2025-08-07T13:58:02Z USER 50185 [ModuleForkPass]: remove_redundancies finished after 0.120 seconds +2025-08-07T13:58:02Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1893mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:02Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:02Z USER 50185 [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:58:02Z INFO 50185 [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:02Z INFO 50185 [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:58:02Z INFO 50185 [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:58:02Z INFO 50185 [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:58:02Z USER 50185 [ModuleForkPass]: anti_dependency_analyzer finished after 0.807 seconds +2025-08-07T13:58:02Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1980mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:02Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:02Z USER 50185 [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:58:02Z INFO 50185 [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:02Z INFO 50185 [TensorCopyElim]: Tensor CP elimination: 0 +2025-08-07T13:58:03Z INFO 50185 [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:58:03Z USER 50185 [ModuleForkPass]: tensor_copy_elim finished after 0.319 seconds +2025-08-07T13:58:03Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1992mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:03Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:03Z USER 50185 [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-08-07T13:58:03Z INFO 50185 [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:03Z USER 50185 [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.007 seconds +2025-08-07T13:58:03Z INFO 50185 [ModuleForkPass]: curr_vmrss: 1992mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:03Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:03Z USER 50185 [ModuleForkPass]: Running post_sched +2025-08-07T13:58:03Z INFO 50185 [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:03Z INFO 50185 [post_scheduler]: Start PosT ScheD 3 sunda Thu Aug 7 13:58:03 2025 +2025-08-07T13:58:05Z INFO 50185 [post_scheduler]: Time-aware hwm post-sched +2025-08-07T13:58:07Z INFO 50185 [post_scheduler]: Time-aware simulation time: 58352865 +2025-08-07T13:58:08Z INFO 50185 [post_scheduler]: Done PosT ScheD Thu Aug 7 13:58:08 2025 +2025-08-07T13:58:08Z USER 50185 [ModuleForkPass]: post_sched finished after 5.266 seconds +2025-08-07T13:58:08Z INFO 50185 [ModuleForkPass]: curr_vmrss: 2383mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:08Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:08Z USER 50185 [ModuleForkPass]: Running expand_scheduling_units +2025-08-07T13:58:08Z INFO 50185 [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:08Z USER 50185 [ModuleForkPass]: expand_scheduling_units finished after 0.034 seconds +2025-08-07T13:58:08Z INFO 50185 [ModuleForkPass]: curr_vmrss: 2142mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:08Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:08Z USER 50185 [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:58:08Z INFO 50185 [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:10Z INFO 50185 [DMAOptimizationBase]: PSUM Rotation rotated 10969 PSUM Banks +2025-08-07T13:58:11Z INFO 50185 [DMAOptimizationBase]: PSUM Rotation rotated 8848 PSUM Banks +2025-08-07T13:58:11Z INFO 50185 [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2025-08-07T13:58:12Z INFO 50185 [DMAOptimizationBase]: SB Rotation rotated 2531 Sb address +2025-08-07T13:58:12Z INFO 50185 [DMAOptimizationBase]: SB Rotation rotated 2569 Sb address +2025-08-07T13:58:12Z INFO 50185 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:58:13Z INFO 50185 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:58:13Z INFO 50185 [DMAOptimizationBase]: SB Rotation rotated 71 Sb address +2025-08-07T13:58:13Z INFO 50185 [DMAOptimizationBase]: moved 0 MM forward +2025-08-07T13:58:14Z INFO 50185 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:58:14Z INFO 50185 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:58:14Z USER 50185 [ModuleForkPass]: address_rotation_sb finished after 6.024 seconds +2025-08-07T13:58:14Z INFO 50185 [ModuleForkPass]: curr_vmrss: 2178mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:14Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:14Z USER 50185 [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:58:14Z INFO 50185 [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:14Z INFO 50185 [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:58:14Z INFO 50185 [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:58:14Z INFO 50185 [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:58:15Z USER 50185 [ModuleForkPass]: anti_dependency_analyzer finished after 0.856 seconds +2025-08-07T13:58:15Z INFO 50185 [ModuleForkPass]: curr_vmrss: 2205mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:15Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:15Z USER 50185 [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:58:15Z INFO 50185 [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:15Z INFO 50185 [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:58:15Z INFO 50185 [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-08-07T13:58:15Z INFO 50185 [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:58:15Z USER 50185 [ModuleForkPass]: anti_dependency_analyzer finished after 0.199 seconds +2025-08-07T13:58:15Z INFO 50185 [ModuleForkPass]: curr_vmrss: 2207mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:15Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:15Z USER 50185 [ModuleForkPass]: Running dep_opt +2025-08-07T13:58:15Z INFO 50185 [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:15Z INFO 50185 [build_flow_deps]: Start build fdeps. Invocation: 3Thu Aug 7 13:58:15 2025 +2025-08-07T13:58:15Z INFO 50185 [build_flow_deps]: Allocs: 68412 instructions: 279653 +2025-08-07T13:58:16Z INFO 50185 [build_flow_deps]: Build fdeps inserted 685617 edges +2025-08-07T13:58:16Z INFO 50185 [build_flow_deps]: Done build fdeps 685617 Thu Aug 7 13:58:16 2025 +2025-08-07T13:58:17Z USER 50185 [ModuleForkPass]: dep_opt finished after 1.583 seconds +2025-08-07T13:58:17Z INFO 50185 [ModuleForkPass]: curr_vmrss: 2212mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:17Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:17Z USER 50185 [ModuleForkPass]: Running report_stats +2025-08-07T13:58:17Z INFO 50185 [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:17Z INFO 50185 [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼────────────┤ +│ Load │ Const -> Internal │ 1 │ 32768 │ +│ Load │ ExternalInput -> Internal │ 7273 │ 6946365440 │ +│ Save │ Internal -> ExternalOutput │ 7273 │ 6946365440 │ +└─────────────┴────────────────────────────┴───────┴────────────┘ + +2025-08-07T13:58:17Z INFO 50185 [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 64 │ 73 │ +│ 256 │ 74 │ +│ 6144 │ 4608 │ +│ 8192 │ 9792 │ +└─────────────────────┴───────┘ + +2025-08-07T13:58:17Z INFO 50185 [ReportStats]: MM Stats: #MatMults 212041 #MatMult-Transposes 212041 +2025-08-07T13:58:17Z INFO 50185 [ReportStats]: IO Tensor size combined: 16382087168 +2025-08-07T13:58:17Z INFO 50185 [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ output0 │ ExternalOutput │ bfloat16 │ 622329856 │ +│ input0 │ ExternalInput │ bfloat16 │ 622329856 │ +│ output397 │ ExternalOutput │ bfloat16 │ 622329856 │ +│ input397 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input8 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input22 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input30 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input20 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input11 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input33 │ ExternalInput │ bfloat16 │ 50331648 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-08-07T13:58:17Z INFO 50185 [ReportStats]: Large (Internal) Tensor Statistics: +┌────────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────────────┼──────────┼──────────┼──────────────┤ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ t2499_pftranspose_20873_i5 │ Internal │ bfloat16 │ 1048576 │ +│ t2499_pftranspose_20873_i2 │ Internal │ bfloat16 │ 1048576 │ +│ t2499_pftranspose_20873_i1 │ Internal │ bfloat16 │ 1048576 │ +│ t2499_pftranspose_20873_i3 │ Internal │ bfloat16 │ 1048576 │ +│ t2499_pftranspose_20873_i4 │ Internal │ bfloat16 │ 1048576 │ +│ t2499_pftranspose_20873_i6 │ Internal │ bfloat16 │ 1048576 │ +│ t2499_pftranspose_20873_i9 │ Internal │ bfloat16 │ 1048576 │ +│ t2499_pftranspose_20873_i8 │ Internal │ bfloat16 │ 1048576 │ +│ t2499_pftranspose_20873_i7 │ Internal │ bfloat16 │ 1048576 │ +└────────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-08-07T13:58:17Z USER 50185 [ModuleForkPass]: report_stats finished after 0.074 seconds +2025-08-07T13:58:17Z INFO 50185 [ModuleForkPass]: curr_vmrss: 2212mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:17Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:17Z USER 50185 [BackendPassManager]: mod_parallel_pass finished after 30.649 seconds +2025-08-07T13:58:17Z INFO 50185 [BackendPassManager]: curr_vmrss: 2212mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:17Z INFO 50185 [BackendPassManager]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:17Z USER 50185 [BackendPassManager]: Running assign_trigger_engine +2025-08-07T13:58:17Z INFO 50185 [BackendPassManager]: Inputs to assign_trigger_engine: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:17Z INFO 50185 [AssignTriggerEngine]: Assigned trigger engine for 0 DMA instructions. Moved 0 DMA instructions to CC's engines. +2025-08-07T13:58:17Z USER 50185 [BackendPassManager]: assign_trigger_engine finished after 0.110 seconds +2025-08-07T13:58:17Z INFO 50185 [BackendPassManager]: curr_vmrss: 2213mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:17Z INFO 50185 [BackendPassManager]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:17Z USER 50185 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:58:17Z INFO 50185 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:17Z USER 50185 [SubgraphForkPass]: Running lower_local_collectives +2025-08-07T13:58:17Z INFO 50185 [SubgraphForkPass]: Inputs to lower_local_collectives: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:17Z USER 50185 [SubgraphForkPass]: lower_local_collectives finished after 0.006 seconds +2025-08-07T13:58:17Z INFO 50185 [SubgraphForkPass]: curr_vmrss: 2213mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:17Z INFO 50185 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:17Z USER 50185 [SubgraphForkPass]: Running extend_shared_lifetimes +2025-08-07T13:58:17Z INFO 50185 [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:17Z USER 50185 [SubgraphForkPass]: extend_shared_lifetimes finished after 0.006 seconds +2025-08-07T13:58:17Z INFO 50185 [SubgraphForkPass]: curr_vmrss: 2213mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:17Z INFO 50185 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:17Z USER 50185 [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:58:17Z INFO 50185 [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:17Z INFO 50185 [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:58:17Z USER 50185 [SubgraphForkPass]: dead_code_elim finished after 0.246 seconds +2025-08-07T13:58:17Z INFO 50185 [SubgraphForkPass]: curr_vmrss: 2213mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:17Z INFO 50185 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:17Z USER 50185 [BackendPassManager]: subgraph_parallel_pass finished after 0.284 seconds +2025-08-07T13:58:17Z INFO 50185 [BackendPassManager]: curr_vmrss: 2213mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:17Z INFO 50185 [BackendPassManager]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:17Z USER 50185 [BackendPassManager]: Running assign_hwdge_engine +2025-08-07T13:58:17Z INFO 50185 [BackendPassManager]: Inputs to assign_hwdge_engine: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:17Z USER 50185 [BackendPassManager]: assign_hwdge_engine finished after 0.036 seconds +2025-08-07T13:58:17Z INFO 50185 [BackendPassManager]: curr_vmrss: 2213mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:17Z INFO 50185 [BackendPassManager]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:17Z USER 50185 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:58:17Z INFO 50185 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:17Z USER 50185 [ModuleForkPass]: Running alloc_queues +2025-08-07T13:58:17Z INFO 50185 [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:17Z INFO 50185 [AllocQueues]: DMACopy transpose will be triggered from multiple engines +2025-08-07T13:58:17Z INFO 50185 [AllocQueues]: Alloc Queue info: +┌─────────────────┬────────────────┬────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├─────────────────┼────────────────┼────────┼────────────┼──────────────────┤ +│ qSPSpillReload0 │ data │ SP │ 16 │ 1 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 14546 │ +└─────────────────┴────────────────┴────────┴────────────┴──────────────────┘ + +2025-08-07T13:58:17Z USER 50185 [ModuleForkPass]: alloc_queues finished after 0.037 seconds +2025-08-07T13:58:17Z INFO 50185 [ModuleForkPass]: curr_vmrss: 2213mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:17Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:17Z USER 50185 [ModuleForkPass]: Running chain_dma_transposes +2025-08-07T13:58:17Z INFO 50185 [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:17Z USER 50185 [ModuleForkPass]: chain_dma_transposes finished after 0.006 seconds +2025-08-07T13:58:17Z INFO 50185 [ModuleForkPass]: curr_vmrss: 2213mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:17Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:17Z USER 50185 [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-08-07T13:58:17Z INFO 50185 [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:17Z USER 50185 [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.006 seconds +2025-08-07T13:58:17Z INFO 50185 [ModuleForkPass]: curr_vmrss: 2213mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:17Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:17Z USER 50185 [ModuleForkPass]: Running lower_control +2025-08-07T13:58:17Z INFO 50185 [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:17Z INFO 50185 [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-08-07T13:58:18Z USER 50185 [ModuleForkPass]: lower_control finished after 0.205 seconds +2025-08-07T13:58:18Z INFO 50185 [ModuleForkPass]: curr_vmrss: 2213mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:18Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:18Z USER 50185 [BackendPassManager]: mod_parallel_pass finished after 0.285 seconds +2025-08-07T13:58:18Z INFO 50185 [BackendPassManager]: curr_vmrss: 2213mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:18Z INFO 50185 [BackendPassManager]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:18Z USER 50185 [BackendPassManager]: Running nc_parallel_pass +2025-08-07T13:58:18Z INFO 50185 [BackendPassManager]: Inputs to nc_parallel_pass: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:18Z USER 50185 [CoreForkPass]: Running dep_reduction +2025-08-07T13:58:18Z INFO 50185 [CoreForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:18Z INFO 50185 [DepReduction]: Start Dependency Reduction +2025-08-07T13:58:18Z INFO 50185 [DepReduction]: Processing async instrs... +2025-08-07T13:58:18Z INFO 50185 [DepReduction]: Processing secondary edges per engine... +2025-08-07T13:58:18Z INFO 50185 [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 473602 +2025-08-07T13:58:18Z INFO 50185 [DepReduction]: Processing redundant descendants, Done. Num edges removed 486433 +2025-08-07T13:58:18Z INFO 50185 [DepReduction]: Processing async instrs, Done. Num edges removed 486433 +2025-08-07T13:58:19Z INFO 50185 [DepReduction]: Num Async removed: 0 +2025-08-07T13:58:19Z INFO 50185 [DepReduction]: Finished dependency reduction: 1150790 removed, new total 112455 +2025-08-07T13:58:19Z INFO 50185 [DepReduction]: Finished Dependency Reduction +2025-08-07T13:58:19Z USER 50185 [CoreForkPass]: dep_reduction finished after 1.655 seconds +2025-08-07T13:58:19Z INFO 50185 [CoreForkPass]: curr_vmrss: 2216mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:19Z INFO 50185 [CoreForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:19Z USER 50185 [CoreForkPass]: Running lower_dynamic_dma +2025-08-07T13:58:19Z INFO 50185 [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:19Z USER 50185 [CoreForkPass]: lower_dynamic_dma finished after 0.083 seconds +2025-08-07T13:58:19Z INFO 50185 [CoreForkPass]: curr_vmrss: 2216mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:19Z INFO 50185 [CoreForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:19Z USER 50185 [CoreForkPass]: Running legalize_dynamic_dma +2025-08-07T13:58:19Z INFO 50185 [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:19Z INFO 50185 [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 0 DGE instructions +2025-08-07T13:58:19Z INFO 50185 [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 0 DGE instructions were scanned +2025-08-07T13:58:19Z INFO 50185 [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-08-07T13:58:19Z USER 50185 [CoreForkPass]: legalize_dynamic_dma finished after 0.121 seconds +2025-08-07T13:58:19Z INFO 50185 [CoreForkPass]: curr_vmrss: 2216mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:19Z INFO 50185 [CoreForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279653 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:19Z USER 50185 [CoreForkPass]: Running lower_dma +2025-08-07T13:58:19Z INFO 50185 [CoreForkPass]: Inputs to lower_dma: modules=1 functions=1 allocs=68412 blocks=1 instructions=279653 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:20Z INFO 50185 [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 14473/14473 (100% DGE) + power-of-2 partition : 14546/14546 (100% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 14546/14546 (100% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/0 + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/0 + Spill/Reload + Copy (DGE/DMA) + 128 partition : 0/1 (0% DGE) + power-of-2 partition : 0/1 (0% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/1 (0% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/0 + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/0 + CopyMode + CCE : 0 + Transpose : 0 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 0/0 + vector : 0/0 + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-08-07T13:58:20Z USER 50185 [CoreForkPass]: lower_dma finished after 0.157 seconds +2025-08-07T13:58:20Z INFO 50185 [CoreForkPass]: curr_vmrss: 2216mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:20Z INFO 50185 [CoreForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279661 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:20Z USER 50185 [CoreForkPass]: Running coalesce_dma_blocks +2025-08-07T13:58:20Z INFO 50185 [CoreForkPass]: Inputs to coalesce_dma_blocks: modules=1 functions=1 allocs=68412 blocks=1 instructions=279661 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:20Z INFO 50185 [CoalesceDmaBlocks]: Coaleseced 0 DMA triggers +2025-08-07T13:58:20Z USER 50185 [CoreForkPass]: coalesce_dma_blocks finished after 0.129 seconds +2025-08-07T13:58:20Z INFO 50185 [CoreForkPass]: curr_vmrss: 2218mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:20Z INFO 50185 [CoreForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279661 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:20Z USER 50185 [CoreForkPass]: Running expand_all_engine +2025-08-07T13:58:20Z INFO 50185 [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=1 allocs=68412 blocks=1 instructions=279661 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:20Z USER 50185 [CoreForkPass]: expand_all_engine finished after 0.050 seconds +2025-08-07T13:58:20Z INFO 50185 [CoreForkPass]: curr_vmrss: 2218mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:20Z INFO 50185 [CoreForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279661 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:20Z USER 50185 [CoreForkPass]: Running alloc_semaphores +2025-08-07T13:58:20Z INFO 50185 [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=1 allocs=68412 blocks=1 instructions=279661 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:20Z USER 50185 [CoreForkPass]: alloc_semaphores finished after 0.275 seconds +2025-08-07T13:58:20Z INFO 50185 [CoreForkPass]: curr_vmrss: 2218mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:20Z INFO 50185 [CoreForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279661 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:20Z USER 50185 [CoreForkPass]: Running expand_inst_late +2025-08-07T13:58:20Z INFO 50185 [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=1 allocs=68412 blocks=1 instructions=279661 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:20Z USER 50185 [CoreForkPass]: expand_inst_late finished after 0.262 seconds +2025-08-07T13:58:20Z INFO 50185 [CoreForkPass]: curr_vmrss: 2218mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:20Z INFO 50185 [CoreForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279661 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:20Z USER 50185 [CoreForkPass]: Running seq_inst_opt +2025-08-07T13:58:20Z INFO 50185 [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=1 allocs=68412 blocks=1 instructions=279661 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:20Z INFO 50185 [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-08-07T13:58:20Z USER 50185 [CoreForkPass]: seq_inst_opt finished after 0.037 seconds +2025-08-07T13:58:20Z INFO 50185 [CoreForkPass]: curr_vmrss: 2218mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:20Z INFO 50185 [CoreForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 279661 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:20Z USER 50185 [CoreForkPass]: Running lower_sync +2025-08-07T13:58:20Z INFO 50185 [CoreForkPass]: Inputs to lower_sync: modules=1 functions=1 allocs=68412 blocks=1 instructions=279661 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:21Z USER 50185 [CoreForkPass]: lower_sync finished after 0.132 seconds +2025-08-07T13:58:21Z INFO 50185 [CoreForkPass]: curr_vmrss: 2218mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:21Z INFO 50185 [CoreForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 295353 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:21Z USER 50185 [CoreForkPass]: Running lower_act +2025-08-07T13:58:21Z INFO 50185 [CoreForkPass]: Inputs to lower_act: modules=1 functions=1 allocs=68412 blocks=1 instructions=295353 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:21Z USER 50185 [CoreForkPass]: lower_act finished after 0.046 seconds +2025-08-07T13:58:21Z INFO 50185 [CoreForkPass]: curr_vmrss: 2218mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:21Z INFO 50185 [CoreForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 295354 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:21Z USER 50185 [CoreForkPass]: Running lower_dve +2025-08-07T13:58:21Z INFO 50185 [CoreForkPass]: Inputs to lower_dve: modules=1 functions=1 allocs=68412 blocks=1 instructions=295354 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:21Z INFO 50185 [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen2/dve_info.json +2025-08-07T13:58:21Z USER 50185 [CoreForkPass]: lower_dve finished after 0.309 seconds +2025-08-07T13:58:21Z INFO 50185 [CoreForkPass]: curr_vmrss: 2245mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:21Z INFO 50185 [CoreForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 295354 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:21Z USER 50185 [CoreForkPass]: Running lower_ap +2025-08-07T13:58:21Z INFO 50185 [CoreForkPass]: Inputs to lower_ap: modules=1 functions=1 allocs=68412 blocks=1 instructions=295354 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:21Z USER 50185 [CoreForkPass]: lower_ap finished after 0.063 seconds +2025-08-07T13:58:21Z INFO 50185 [CoreForkPass]: curr_vmrss: 2105mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:21Z INFO 50185 [CoreForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 295354 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:21Z USER 50185 [CoreForkPass]: Running coloring_allocator_reg +2025-08-07T13:58:21Z INFO 50185 [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=1 allocs=68412 blocks=1 instructions=295354 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:21Z INFO 50185 [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:58:21Z INFO 50185 [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:58:21Z INFO 50185 [REG_Allocator]: allocating REG +2025-08-07T13:58:21Z INFO 50185 [REG_Allocator]: main loop iteration 1 +2025-08-07T13:58:21Z USER 50185 [CoreForkPass]: coloring_allocator_reg finished after 0.050 seconds +2025-08-07T13:58:21Z INFO 50185 [CoreForkPass]: curr_vmrss: 2117mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:21Z INFO 50185 [CoreForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 295354 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:21Z USER 50185 [BackendPassManager]: nc_parallel_pass finished after 3.543 seconds +2025-08-07T13:58:21Z INFO 50185 [BackendPassManager]: curr_vmrss: 2117mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:21Z INFO 50185 [BackendPassManager]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 295354 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:21Z USER 50185 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:58:21Z INFO 50185 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=68412 blocks=1 instructions=295354 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:21Z USER 50185 [ModuleForkPass]: Running birverifier +2025-08-07T13:58:21Z INFO 50185 [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=68412 blocks=1 instructions=295354 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:21Z USER 50185 [ModuleForkPass]: birverifier finished after 0.285 seconds +2025-08-07T13:58:21Z INFO 50185 [ModuleForkPass]: curr_vmrss: 2119mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:21Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 295354 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:21Z USER 50185 [BackendPassManager]: mod_parallel_pass finished after 0.300 seconds +2025-08-07T13:58:21Z INFO 50185 [BackendPassManager]: curr_vmrss: 2119mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:21Z INFO 50185 [BackendPassManager]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 295354 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:21Z USER 50185 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:58:21Z INFO 50185 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=1 allocs=68412 blocks=1 instructions=295354 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:21Z USER 50185 [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:58:21Z INFO 50185 [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=68412 blocks=1 instructions=295354 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:21Z USER 50185 [SubgraphForkPass]: lnc_verifier finished after 0.006 seconds +2025-08-07T13:58:21Z INFO 50185 [SubgraphForkPass]: curr_vmrss: 2119mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:21Z INFO 50185 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 295354 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:21Z USER 50185 [BackendPassManager]: subgraph_parallel_pass finished after 0.018 seconds +2025-08-07T13:58:21Z INFO 50185 [BackendPassManager]: curr_vmrss: 2119mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:21Z INFO 50185 [BackendPassManager]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 295354 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:21Z USER 50185 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:58:21Z INFO 50185 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=68412 blocks=1 instructions=295354 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:21Z USER 50185 [ModuleForkPass]: Running codegen +2025-08-07T13:58:21Z INFO 50185 [ModuleForkPass]: Inputs to codegen: modules=1 functions=1 allocs=68412 blocks=1 instructions=295354 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:21Z INFO 50185 [Codegen]: Total compiler allocated DRAM tensors: 0 GB +2025-08-07T13:58:21Z INFO 50185 [Codegen]: Total un-allocated DRAM tensors by kind: +2025-08-07T13:58:21Z INFO 50185 [Codegen]: +┌───────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├───────────────┼─────────────┤ +│ ExternalInput │ 7.6285 │ +│ Const │ 3.05176e-05 │ +└───────────────┴─────────────┘ + +2025-08-07T13:58:21Z INFO 50185 [Codegen]: Total runtime managed DRAM tensors: 7.62853 GB +2025-08-07T13:58:23Z INFO 50185 [Codegen]: Instruction Stats: +2025-08-07T13:58:23Z INFO 50185 [Codegen]: +┌─────────────────────┬────────┐ +│ Opcode │ Count │ +├─────────────────────┼────────┤ +│ LDWEIGHTS │ 212041 │ +│ MATMUL │ 212041 │ +│ ACTIVATE │ 53065 │ +│ EVENT_SEMAPHORE │ 15692 │ +│ UNKNOWN(0xd4) │ 14546 │ +│ NOP │ 7 │ +│ PSEUDO_BRANCH_LABEL │ 5 │ +│ ACT_TABLE_LOAD │ 1 │ +│ PSEUDO_DMA_TRIGGER │ 1 │ +└─────────────────────┴────────┘ + +2025-08-07T13:58:23Z INFO 50185 [Codegen]: +┌────────────┬────────┐ +│ Engine │ Count │ +├────────────┼────────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 21233 │ +│ Scalar │ 55905 │ +│ Tensor │ 430261 │ +│ SyncDMA │ 0 │ +│ Vector │ 2 │ +│ Sync │ 3 │ +│ All │ 0 │ +└────────────┴────────┘ + +2025-08-07T13:58:23Z INFO 50185 [Codegen]: Total instructions: 507404 (0.0302436 GB) +2025-08-07T13:58:23Z INFO 50185 [Codegen]: Total DynamicDMA instruction count: 14546 +2025-08-07T13:58:23Z USER 50185 [Codegen]: isa_gen finished after 1.099 seconds +2025-08-07T13:58:23Z INFO 50185 [Codegen]: Number of DMA descriptors on each queue instance: +┌─────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├─────────────────┼────────────────┤ +│ qSPSpillReload0 │ 256 │ +└─────────────────┴────────────────┘ + +Total descriptors: 256 (3.8147e-06 GB) +2025-08-07T13:58:23Z INFO 50185 [Codegen]: Number of DMA engines used by each queue: +┌─────────────────┬─────────────────────┐ +│ Queue │ DMA Engines │ +├─────────────────┼─────────────────────┤ +│ qSPSpillReload0 │ 16 │ +│ qPoolDynamic │ 16 │ +├─────────────────┼─────────────────────┤ +│ TOTAL │ 32 (must be <= 176) │ +└─────────────────┴─────────────────────┘ + +2025-08-07T13:58:23Z INFO 50185 [Codegen]: Tensors with largest descriptor count: +┌──────────────────────┬──────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├──────────────────────┼──────────┼──────────┼──────────────────┤ +│ identity_local_25028 │ Internal │ bfloat16 │ 1 │ +│ identity_25026 │ Const │ bfloat16 │ 1 │ +└──────────────────────┴──────────┴──────────┴──────────────────┘ + +2025-08-07T13:58:23Z USER 50185 [Codegen]: dma_desc_gen finished after 0.000 seconds +2025-08-07T13:58:23Z INFO 50185 [Codegen]: Estimated peak DRAM usage: 7.65878 GB +2025-08-07T13:58:23Z INFO 50185 [Codegen]: Generating debug info +2025-08-07T13:58:23Z USER 50185 [Codegen]: debug_info_gen finished after 0.545 seconds +2025-08-07T13:58:23Z USER 50185 [ModuleForkPass]: codegen finished after 1.704 seconds +2025-08-07T13:58:23Z INFO 50185 [ModuleForkPass]: curr_vmrss: 2310mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:23Z INFO 50185 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 295354 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:23Z USER 50185 [BackendPassManager]: mod_parallel_pass finished after 1.736 seconds +2025-08-07T13:58:23Z INFO 50185 [BackendPassManager]: curr_vmrss: 2134mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:23Z INFO 50185 [BackendPassManager]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 295354 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:23Z USER 50185 [BackendPassManager]: Running neff_packager +2025-08-07T13:58:23Z INFO 50185 [BackendPassManager]: Inputs to neff_packager: modules=1 functions=1 allocs=68412 blocks=1 instructions=295354 Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:23Z WARNING 50185 [NeffFileWriter]: writeKelp missing file /local/p4clients/pkgbuild-const/workspace/build/KaenaCompiler/KaenaCompiler-2.x.169490.0/AL2_x86_64/DEV.STD.PTHREAD/build/private/_skbuild/linux-x86_64-3.10/cmake-build/neuronxcc/walrus/neff_packager/MetricMetadata.json +2025-08-07T13:58:23Z INFO 50185 [NeffFileWriter]: Neff will be written to: /home/ubuntu/qwen3/layout_opt/graph.neff +2025-08-07T13:58:23Z INFO 50185 [NeffFileWriter]: IR signature: c6cb604c4535169891036e23b5114d01 for neff artifacts +2025-08-07T13:58:23Z USER 50185 [BackendPassManager]: neff_packager finished after 0.312 seconds +2025-08-07T13:58:23Z INFO 50185 [BackendPassManager]: curr_vmrss: 2134mb, ru_maxrss: 2492mb (delta=0mb) +2025-08-07T13:58:24Z INFO 50185 [BackendPassManager]: Output has 1 module(s), 1 function(s), 68412 memory location(s), 1 block(s), and 295354 instruction(s). Max writers: 64 Max Readers: 212041 +2025-08-07T13:58:24Z INFO 50185 [BackendDriver]: HBM scratchpad usage summary (post-allocation): +┌──────┬───────────┬────────────────────────────────────────────────────────────┬─────────────┐ +│ Core │ Subgraph │ Description │ Value │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc00 │ module │ Peak scratchpad usage: local │ 0.000000 GB │ +│ nc00 │ module │ Total size of allocated tensors: local │ 0.000000 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local │ 0.000000 GB │ +│ nc00 │ Post-link │ Peak scratchpad usage after intermediate tensor allocation │ 0.000000 GB │ +│ nc00 │ Post-link │ Total size of allocated intermediate tensors │ 0.000000 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ Max │ Max │ Peak scratchpad usage │ 0.000000 GB │ +│ Max │ Max │ Peak scratchpad usage (page-aligned) │ 0.000000 GB │ +└──────┴───────────┴────────────────────────────────────────────────────────────┴─────────────┘ + +2025-08-07T13:58:24Z INFO 50185 [BackendDriver]: Backend completed successfully, tearing down. +2025-08-07T13:58:24Z INFO 50115 [job.WalrusDriver.0]: Job #0 finished +2025-08-07T13:58:24Z INFO 50115 [pipeline.Pipeline.0]: Finished job job.WalrusDriver.0 +2025-08-07T13:58:24Z INFO 50115 [pipeline.Pipeline.0]: Starting job job.BIRLinker.0 +2025-08-07T13:58:24Z INFO 50115 [job.BIRLinker.0]: Replay this job by calling: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/bin/neuronx-cc compile --framework XLA --state '{"model": ["/home/ubuntu/qwen3/layout_opt/model/graph.hlo"], "tensormap": "tensor_map.json", "bir": "bir.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "state_dir": "/home/ubuntu/neuronxcc-6q5tifbo/sg00", "state_id": "sg00"}' --pipeline BIRLinker +2025-08-07T13:58:24Z INFO 50115 [job.BIRLinker.0]: BIRLinker cwd: /home/ubuntu/neuronxcc-6q5tifbo +2025-08-07T13:58:24Z INFO 50115 [job.BIRLinker.0]: Linking not needed. Netlist doesnt exist +2025-08-07T13:58:24Z INFO 50115 [pipeline.Pipeline.0]: Finished job job.BIRLinker.0 +2025-08-07T13:58:24Z INFO 50115 [pipeline.Pipeline.0]: Starting job job.Kelper.0 +2025-08-07T13:58:24Z INFO 50115 [job.Kelper.0]: Skipping neff generation which was already performed by neff_packager +2025-08-07T13:58:24Z INFO 50115 [pipeline.Pipeline.0]: Finished job job.Kelper.0 +2025-08-07T13:58:24Z INFO 50115 [pipeline.Pipeline.0]: Starting job job.NeffWrapper.0 +2025-08-07T13:58:24Z INFO 50115 [job.NeffWrapper.0]: Job NeffWrapper len(in_states) 1 +2025-08-07T13:58:24Z INFO 50115 [job.NeffWrapper.0]: Processing input #0 +2025-08-07T13:58:24Z INFO 50115 [job.NeffWrapper.0]: Start NeffWrapper +2025-08-07T13:58:24Z INFO 50115 [job.NeffWrapper.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo-neff-wrapper --hlo /home/ubuntu/qwen3/layout_opt/model/graph.hlo --neff /home/ubuntu/qwen3/layout_opt/graph.neff --io_transposes /home/ubuntu/neuronxcc-6q5tifbo/io_transposes.json --output /home/ubuntu/qwen3/layout_opt/wrapped_neff.hlo --netlist /home/ubuntu/neuronxcc-6q5tifbo/hlo_netlist.json +2025-08-07T13:58:24Z INFO 50115 [job.NeffWrapper.0]: Could not open file: /home/ubuntu/neuronxcc-6q5tifbo/hlo_netlist.json +There are no io transposes nor zero-sized parameters. Output will not be produced. +Hlo neff wrapper finished successfully. Have a wonderful day :D + +2025-08-07T13:58:24Z INFO 50115 [job.NeffWrapper.0]: Job #0 finished +2025-08-07T13:58:24Z INFO 50115 [pipeline.Pipeline.0]: Finished job job.NeffWrapper.0 +2025-08-07T13:58:24Z INFO 50115 [pipeline.Pipeline.0]: Finished pipeline Pipeline +2025-08-07T13:58:24Z INFO 50115 [pipeline.Pipeline.0]: Job #0 finished +2025-08-07T13:58:24Z INFO 50051 [root]: Subcommand returned with exitcode=0 diff --git a/layout_opt/metaneff b/layout_opt/metaneff new file mode 100644 index 0000000000000000000000000000000000000000..55b9aefcc326af47292f379b8b0d71bbeb535b74 --- /dev/null +++ b/layout_opt/metaneff @@ -0,0 +1,1198 @@ + +( +input0�� �2embed_tokens.weight8 +; +input1� �2'layers.0.self_attn.o_proj.o_proj.weight8 += +input2�� 2)layers.0.self_attn.qkv_proj.v_proj.weight8 +1 +input3� 2layers.0.input_layernorm.weight8 +7 +input4�2%layers.0.self_attn.k_layernorm.weight8 += +input5�� 2)layers.0.self_attn.qkv_proj.k_proj.weight8 +7 +input6�2%layers.0.self_attn.q_layernorm.weight8 += +input7�� 2)layers.0.self_attn.qkv_proj.q_proj.weight8 +1 +input8� �02layers.0.mlp.down_proj.weight8 +/ +input9�0� 2layers.0.mlp.up_proj.weight8 +; +input10� 2(layers.0.post_attention_layernorm.weight8 +2 +input11�0� 2layers.0.mlp.gate_proj.weight8 +< +input12� �2'layers.1.self_attn.o_proj.o_proj.weight8 +> +input13�� 2)layers.1.self_attn.qkv_proj.v_proj.weight8 +2 +input14� 2layers.1.input_layernorm.weight8 +8 +input15�2%layers.1.self_attn.k_layernorm.weight8 +> +input16�� 2)layers.1.self_attn.qkv_proj.k_proj.weight8 +8 +input17�2%layers.1.self_attn.q_layernorm.weight8 +> +input18�� 2)layers.1.self_attn.qkv_proj.q_proj.weight8 +2 +input19� �02layers.1.mlp.down_proj.weight8 +0 +input20�0� 2layers.1.mlp.up_proj.weight8 +; +input21� 2(layers.1.post_attention_layernorm.weight8 +2 +input22�0� 2layers.1.mlp.gate_proj.weight8 +< +input23� �2'layers.2.self_attn.o_proj.o_proj.weight8 +> +input24�� 2)layers.2.self_attn.qkv_proj.v_proj.weight8 +2 +input25� 2layers.2.input_layernorm.weight8 +8 +input26�2%layers.2.self_attn.k_layernorm.weight8 +> +input27�� 2)layers.2.self_attn.qkv_proj.k_proj.weight8 +8 +input28�2%layers.2.self_attn.q_layernorm.weight8 +> +input29�� 2)layers.2.self_attn.qkv_proj.q_proj.weight8 +2 +input30� �02layers.2.mlp.down_proj.weight8 +0 +input31�0� 2layers.2.mlp.up_proj.weight8 +; +input32� 2(layers.2.post_attention_layernorm.weight8 +2 +input33�0� 2layers.2.mlp.gate_proj.weight8 +< +input34� �2'layers.3.self_attn.o_proj.o_proj.weight8 +> +input35�� 2)layers.3.self_attn.qkv_proj.v_proj.weight8 +2 +input36� 2layers.3.input_layernorm.weight8 +8 +input37�2%layers.3.self_attn.k_layernorm.weight8 +> +input38�� 2)layers.3.self_attn.qkv_proj.k_proj.weight8 +8 +input39�2%layers.3.self_attn.q_layernorm.weight8 +> +input40�� 2)layers.3.self_attn.qkv_proj.q_proj.weight8 +2 +input41� �02layers.3.mlp.down_proj.weight8 +0 +input42�0� 2layers.3.mlp.up_proj.weight8 +; +input43� 2(layers.3.post_attention_layernorm.weight8 +2 +input44�0� 2layers.3.mlp.gate_proj.weight8 +< +input45� �2'layers.4.self_attn.o_proj.o_proj.weight8 +> +input46�� 2)layers.4.self_attn.qkv_proj.v_proj.weight8 +2 +input47� 2layers.4.input_layernorm.weight8 +8 +input48�2%layers.4.self_attn.k_layernorm.weight8 +> +input49�� 2)layers.4.self_attn.qkv_proj.k_proj.weight8 +8 +input50�2%layers.4.self_attn.q_layernorm.weight8 +> +input51�� 2)layers.4.self_attn.qkv_proj.q_proj.weight8 +2 +input52� �02layers.4.mlp.down_proj.weight8 +0 +input53�0� 2layers.4.mlp.up_proj.weight8 +; +input54� 2(layers.4.post_attention_layernorm.weight8 +2 +input55�0� 2layers.4.mlp.gate_proj.weight8 +< +input56� �2'layers.5.self_attn.o_proj.o_proj.weight8 +> +input57�� 2)layers.5.self_attn.qkv_proj.v_proj.weight8 +2 +input58� 2layers.5.input_layernorm.weight8 +8 +input59�2%layers.5.self_attn.k_layernorm.weight8 +> +input60�� 2)layers.5.self_attn.qkv_proj.k_proj.weight8 +8 +input61�2%layers.5.self_attn.q_layernorm.weight8 +> +input62�� 2)layers.5.self_attn.qkv_proj.q_proj.weight8 +2 +input63� �02layers.5.mlp.down_proj.weight8 +0 +input64�0� 2layers.5.mlp.up_proj.weight8 +; +input65� 2(layers.5.post_attention_layernorm.weight8 +2 +input66�0� 2layers.5.mlp.gate_proj.weight8 +< +input67� �2'layers.6.self_attn.o_proj.o_proj.weight8 +> +input68�� 2)layers.6.self_attn.qkv_proj.v_proj.weight8 +2 +input69� 2layers.6.input_layernorm.weight8 +8 +input70�2%layers.6.self_attn.k_layernorm.weight8 +> +input71�� 2)layers.6.self_attn.qkv_proj.k_proj.weight8 +8 +input72�2%layers.6.self_attn.q_layernorm.weight8 +> +input73�� 2)layers.6.self_attn.qkv_proj.q_proj.weight8 +2 +input74� �02layers.6.mlp.down_proj.weight8 +0 +input75�0� 2layers.6.mlp.up_proj.weight8 +; +input76� 2(layers.6.post_attention_layernorm.weight8 +2 +input77�0� 2layers.6.mlp.gate_proj.weight8 +< +input78� �2'layers.7.self_attn.o_proj.o_proj.weight8 +> +input79�� 2)layers.7.self_attn.qkv_proj.v_proj.weight8 +2 +input80� 2layers.7.input_layernorm.weight8 +8 +input81�2%layers.7.self_attn.k_layernorm.weight8 +> +input82�� 2)layers.7.self_attn.qkv_proj.k_proj.weight8 +8 +input83�2%layers.7.self_attn.q_layernorm.weight8 +> +input84�� 2)layers.7.self_attn.qkv_proj.q_proj.weight8 +2 +input85� �02layers.7.mlp.down_proj.weight8 +0 +input86�0� 2layers.7.mlp.up_proj.weight8 +; +input87� 2(layers.7.post_attention_layernorm.weight8 +2 +input88�0� 2layers.7.mlp.gate_proj.weight8 +< +input89� �2'layers.8.self_attn.o_proj.o_proj.weight8 +> +input90�� 2)layers.8.self_attn.qkv_proj.v_proj.weight8 +2 +input91� 2layers.8.input_layernorm.weight8 +8 +input92�2%layers.8.self_attn.k_layernorm.weight8 +> +input93�� 2)layers.8.self_attn.qkv_proj.k_proj.weight8 +8 +input94�2%layers.8.self_attn.q_layernorm.weight8 +> +input95�� 2)layers.8.self_attn.qkv_proj.q_proj.weight8 +2 +input96� �02layers.8.mlp.down_proj.weight8 +0 +input97�0� 2layers.8.mlp.up_proj.weight8 +; +input98� 2(layers.8.post_attention_layernorm.weight8 +2 +input99�0� 2layers.8.mlp.gate_proj.weight8 += +input100� �2'layers.9.self_attn.o_proj.o_proj.weight8 +? +input101�� 2)layers.9.self_attn.qkv_proj.v_proj.weight8 +3 +input102� 2layers.9.input_layernorm.weight8 +9 +input103�2%layers.9.self_attn.k_layernorm.weight8 +? +input104�� 2)layers.9.self_attn.qkv_proj.k_proj.weight8 +9 +input105�2%layers.9.self_attn.q_layernorm.weight8 +? +input106�� 2)layers.9.self_attn.qkv_proj.q_proj.weight8 +3 +input107� �02layers.9.mlp.down_proj.weight8 +1 +input108�0� 2layers.9.mlp.up_proj.weight8 +< +input109� 2(layers.9.post_attention_layernorm.weight8 +3 +input110�0� 2layers.9.mlp.gate_proj.weight8 +> +input111� �2(layers.10.self_attn.o_proj.o_proj.weight8 +@ +input112�� 2*layers.10.self_attn.qkv_proj.v_proj.weight8 +4 +input113� 2 layers.10.input_layernorm.weight8 +: +input114�2&layers.10.self_attn.k_layernorm.weight8 +@ +input115�� 2*layers.10.self_attn.qkv_proj.k_proj.weight8 +: +input116�2&layers.10.self_attn.q_layernorm.weight8 +@ +input117�� 2*layers.10.self_attn.qkv_proj.q_proj.weight8 +4 +input118� �02layers.10.mlp.down_proj.weight8 +2 +input119�0� 2layers.10.mlp.up_proj.weight8 += +input120� 2)layers.10.post_attention_layernorm.weight8 +4 +input121�0� 2layers.10.mlp.gate_proj.weight8 +> +input122� �2(layers.11.self_attn.o_proj.o_proj.weight8 +@ +input123�� 2*layers.11.self_attn.qkv_proj.v_proj.weight8 +4 +input124� 2 layers.11.input_layernorm.weight8 +: +input125�2&layers.11.self_attn.k_layernorm.weight8 +@ +input126�� 2*layers.11.self_attn.qkv_proj.k_proj.weight8 +: +input127�2&layers.11.self_attn.q_layernorm.weight8 +@ +input128�� 2*layers.11.self_attn.qkv_proj.q_proj.weight8 +4 +input129� �02layers.11.mlp.down_proj.weight8 +2 +input130�0� 2layers.11.mlp.up_proj.weight8 += +input131� 2)layers.11.post_attention_layernorm.weight8 +4 +input132�0� 2layers.11.mlp.gate_proj.weight8 +> +input133� �2(layers.12.self_attn.o_proj.o_proj.weight8 +@ +input134�� 2*layers.12.self_attn.qkv_proj.v_proj.weight8 +4 +input135� 2 layers.12.input_layernorm.weight8 +: +input136�2&layers.12.self_attn.k_layernorm.weight8 +@ +input137�� 2*layers.12.self_attn.qkv_proj.k_proj.weight8 +: +input138�2&layers.12.self_attn.q_layernorm.weight8 +@ +input139�� 2*layers.12.self_attn.qkv_proj.q_proj.weight8 +4 +input140� �02layers.12.mlp.down_proj.weight8 +2 +input141�0� 2layers.12.mlp.up_proj.weight8 += +input142� 2)layers.12.post_attention_layernorm.weight8 +4 +input143�0� 2layers.12.mlp.gate_proj.weight8 +> +input144� �2(layers.13.self_attn.o_proj.o_proj.weight8 +@ +input145�� 2*layers.13.self_attn.qkv_proj.v_proj.weight8 +4 +input146� 2 layers.13.input_layernorm.weight8 +: +input147�2&layers.13.self_attn.k_layernorm.weight8 +@ +input148�� 2*layers.13.self_attn.qkv_proj.k_proj.weight8 +: +input149�2&layers.13.self_attn.q_layernorm.weight8 +@ +input150�� 2*layers.13.self_attn.qkv_proj.q_proj.weight8 +4 +input151� �02layers.13.mlp.down_proj.weight8 +2 +input152�0� 2layers.13.mlp.up_proj.weight8 += +input153� 2)layers.13.post_attention_layernorm.weight8 +4 +input154�0� 2layers.13.mlp.gate_proj.weight8 +> +input155� �2(layers.14.self_attn.o_proj.o_proj.weight8 +@ +input156�� 2*layers.14.self_attn.qkv_proj.v_proj.weight8 +4 +input157� 2 layers.14.input_layernorm.weight8 +: +input158�2&layers.14.self_attn.k_layernorm.weight8 +@ +input159�� 2*layers.14.self_attn.qkv_proj.k_proj.weight8 +: +input160�2&layers.14.self_attn.q_layernorm.weight8 +@ +input161�� 2*layers.14.self_attn.qkv_proj.q_proj.weight8 +4 +input162� �02layers.14.mlp.down_proj.weight8 +2 +input163�0� 2layers.14.mlp.up_proj.weight8 += +input164� 2)layers.14.post_attention_layernorm.weight8 +4 +input165�0� 2layers.14.mlp.gate_proj.weight8 +> +input166� �2(layers.15.self_attn.o_proj.o_proj.weight8 +@ +input167�� 2*layers.15.self_attn.qkv_proj.v_proj.weight8 +4 +input168� 2 layers.15.input_layernorm.weight8 +: +input169�2&layers.15.self_attn.k_layernorm.weight8 +@ +input170�� 2*layers.15.self_attn.qkv_proj.k_proj.weight8 +: +input171�2&layers.15.self_attn.q_layernorm.weight8 +@ +input172�� 2*layers.15.self_attn.qkv_proj.q_proj.weight8 +4 +input173� �02layers.15.mlp.down_proj.weight8 +2 +input174�0� 2layers.15.mlp.up_proj.weight8 += +input175� 2)layers.15.post_attention_layernorm.weight8 +4 +input176�0� 2layers.15.mlp.gate_proj.weight8 +> +input177� �2(layers.16.self_attn.o_proj.o_proj.weight8 +@ +input178�� 2*layers.16.self_attn.qkv_proj.v_proj.weight8 +4 +input179� 2 layers.16.input_layernorm.weight8 +: +input180�2&layers.16.self_attn.k_layernorm.weight8 +@ +input181�� 2*layers.16.self_attn.qkv_proj.k_proj.weight8 +: +input182�2&layers.16.self_attn.q_layernorm.weight8 +@ +input183�� 2*layers.16.self_attn.qkv_proj.q_proj.weight8 +4 +input184� �02layers.16.mlp.down_proj.weight8 +2 +input185�0� 2layers.16.mlp.up_proj.weight8 += +input186� 2)layers.16.post_attention_layernorm.weight8 +4 +input187�0� 2layers.16.mlp.gate_proj.weight8 +> +input188� �2(layers.17.self_attn.o_proj.o_proj.weight8 +@ +input189�� 2*layers.17.self_attn.qkv_proj.v_proj.weight8 +4 +input190� 2 layers.17.input_layernorm.weight8 +: +input191�2&layers.17.self_attn.k_layernorm.weight8 +@ +input192�� 2*layers.17.self_attn.qkv_proj.k_proj.weight8 +: +input193�2&layers.17.self_attn.q_layernorm.weight8 +@ +input194�� 2*layers.17.self_attn.qkv_proj.q_proj.weight8 +4 +input195� �02layers.17.mlp.down_proj.weight8 +2 +input196�0� 2layers.17.mlp.up_proj.weight8 += +input197� 2)layers.17.post_attention_layernorm.weight8 +4 +input198�0� 2layers.17.mlp.gate_proj.weight8 +> +input199� �2(layers.18.self_attn.o_proj.o_proj.weight8 +@ +input200�� 2*layers.18.self_attn.qkv_proj.v_proj.weight8 +4 +input201� 2 layers.18.input_layernorm.weight8 +: +input202�2&layers.18.self_attn.k_layernorm.weight8 +@ +input203�� 2*layers.18.self_attn.qkv_proj.k_proj.weight8 +: +input204�2&layers.18.self_attn.q_layernorm.weight8 +@ +input205�� 2*layers.18.self_attn.qkv_proj.q_proj.weight8 +4 +input206� �02layers.18.mlp.down_proj.weight8 +2 +input207�0� 2layers.18.mlp.up_proj.weight8 += +input208� 2)layers.18.post_attention_layernorm.weight8 +4 +input209�0� 2layers.18.mlp.gate_proj.weight8 +> +input210� �2(layers.19.self_attn.o_proj.o_proj.weight8 +@ +input211�� 2*layers.19.self_attn.qkv_proj.v_proj.weight8 +4 +input212� 2 layers.19.input_layernorm.weight8 +: +input213�2&layers.19.self_attn.k_layernorm.weight8 +@ +input214�� 2*layers.19.self_attn.qkv_proj.k_proj.weight8 +: +input215�2&layers.19.self_attn.q_layernorm.weight8 +@ +input216�� 2*layers.19.self_attn.qkv_proj.q_proj.weight8 +4 +input217� �02layers.19.mlp.down_proj.weight8 +2 +input218�0� 2layers.19.mlp.up_proj.weight8 += +input219� 2)layers.19.post_attention_layernorm.weight8 +4 +input220�0� 2layers.19.mlp.gate_proj.weight8 +> +input221� �2(layers.20.self_attn.o_proj.o_proj.weight8 +@ +input222�� 2*layers.20.self_attn.qkv_proj.v_proj.weight8 +4 +input223� 2 layers.20.input_layernorm.weight8 +: +input224�2&layers.20.self_attn.k_layernorm.weight8 +@ +input225�� 2*layers.20.self_attn.qkv_proj.k_proj.weight8 +: +input226�2&layers.20.self_attn.q_layernorm.weight8 +@ +input227�� 2*layers.20.self_attn.qkv_proj.q_proj.weight8 +4 +input228� �02layers.20.mlp.down_proj.weight8 +2 +input229�0� 2layers.20.mlp.up_proj.weight8 += +input230� 2)layers.20.post_attention_layernorm.weight8 +4 +input231�0� 2layers.20.mlp.gate_proj.weight8 +> +input232� �2(layers.21.self_attn.o_proj.o_proj.weight8 +@ +input233�� 2*layers.21.self_attn.qkv_proj.v_proj.weight8 +4 +input234� 2 layers.21.input_layernorm.weight8 +: +input235�2&layers.21.self_attn.k_layernorm.weight8 +@ +input236�� 2*layers.21.self_attn.qkv_proj.k_proj.weight8 +: +input237�2&layers.21.self_attn.q_layernorm.weight8 +@ +input238�� 2*layers.21.self_attn.qkv_proj.q_proj.weight8 +4 +input239� �02layers.21.mlp.down_proj.weight8 +2 +input240�0� 2layers.21.mlp.up_proj.weight8 += +input241� 2)layers.21.post_attention_layernorm.weight8 +4 +input242�0� 2layers.21.mlp.gate_proj.weight8 +> +input243� �2(layers.22.self_attn.o_proj.o_proj.weight8 +@ +input244�� 2*layers.22.self_attn.qkv_proj.v_proj.weight8 +4 +input245� 2 layers.22.input_layernorm.weight8 +: +input246�2&layers.22.self_attn.k_layernorm.weight8 +@ +input247�� 2*layers.22.self_attn.qkv_proj.k_proj.weight8 +: +input248�2&layers.22.self_attn.q_layernorm.weight8 +@ +input249�� 2*layers.22.self_attn.qkv_proj.q_proj.weight8 +4 +input250� �02layers.22.mlp.down_proj.weight8 +2 +input251�0� 2layers.22.mlp.up_proj.weight8 += +input252� 2)layers.22.post_attention_layernorm.weight8 +4 +input253�0� 2layers.22.mlp.gate_proj.weight8 +> +input254� �2(layers.23.self_attn.o_proj.o_proj.weight8 +@ +input255�� 2*layers.23.self_attn.qkv_proj.v_proj.weight8 +4 +input256� 2 layers.23.input_layernorm.weight8 +: +input257�2&layers.23.self_attn.k_layernorm.weight8 +@ +input258�� 2*layers.23.self_attn.qkv_proj.k_proj.weight8 +: +input259�2&layers.23.self_attn.q_layernorm.weight8 +@ +input260�� 2*layers.23.self_attn.qkv_proj.q_proj.weight8 +4 +input261� �02layers.23.mlp.down_proj.weight8 +2 +input262�0� 2layers.23.mlp.up_proj.weight8 += +input263� 2)layers.23.post_attention_layernorm.weight8 +4 +input264�0� 2layers.23.mlp.gate_proj.weight8 +> +input265� �2(layers.24.self_attn.o_proj.o_proj.weight8 +@ +input266�� 2*layers.24.self_attn.qkv_proj.v_proj.weight8 +4 +input267� 2 layers.24.input_layernorm.weight8 +: +input268�2&layers.24.self_attn.k_layernorm.weight8 +@ +input269�� 2*layers.24.self_attn.qkv_proj.k_proj.weight8 +: +input270�2&layers.24.self_attn.q_layernorm.weight8 +@ +input271�� 2*layers.24.self_attn.qkv_proj.q_proj.weight8 +4 +input272� �02layers.24.mlp.down_proj.weight8 +2 +input273�0� 2layers.24.mlp.up_proj.weight8 += +input274� 2)layers.24.post_attention_layernorm.weight8 +4 +input275�0� 2layers.24.mlp.gate_proj.weight8 +> +input276� �2(layers.25.self_attn.o_proj.o_proj.weight8 +@ +input277�� 2*layers.25.self_attn.qkv_proj.v_proj.weight8 +4 +input278� 2 layers.25.input_layernorm.weight8 +: +input279�2&layers.25.self_attn.k_layernorm.weight8 +@ +input280�� 2*layers.25.self_attn.qkv_proj.k_proj.weight8 +: +input281�2&layers.25.self_attn.q_layernorm.weight8 +@ +input282�� 2*layers.25.self_attn.qkv_proj.q_proj.weight8 +4 +input283� �02layers.25.mlp.down_proj.weight8 +2 +input284�0� 2layers.25.mlp.up_proj.weight8 += +input285� 2)layers.25.post_attention_layernorm.weight8 +4 +input286�0� 2layers.25.mlp.gate_proj.weight8 +> +input287� �2(layers.26.self_attn.o_proj.o_proj.weight8 +@ +input288�� 2*layers.26.self_attn.qkv_proj.v_proj.weight8 +4 +input289� 2 layers.26.input_layernorm.weight8 +: +input290�2&layers.26.self_attn.k_layernorm.weight8 +@ +input291�� 2*layers.26.self_attn.qkv_proj.k_proj.weight8 +: +input292�2&layers.26.self_attn.q_layernorm.weight8 +@ +input293�� 2*layers.26.self_attn.qkv_proj.q_proj.weight8 +4 +input294� �02layers.26.mlp.down_proj.weight8 +2 +input295�0� 2layers.26.mlp.up_proj.weight8 += +input296� 2)layers.26.post_attention_layernorm.weight8 +4 +input297�0� 2layers.26.mlp.gate_proj.weight8 +> +input298� �2(layers.27.self_attn.o_proj.o_proj.weight8 +@ +input299�� 2*layers.27.self_attn.qkv_proj.v_proj.weight8 +4 +input300� 2 layers.27.input_layernorm.weight8 +: +input301�2&layers.27.self_attn.k_layernorm.weight8 +@ +input302�� 2*layers.27.self_attn.qkv_proj.k_proj.weight8 +: +input303�2&layers.27.self_attn.q_layernorm.weight8 +@ +input304�� 2*layers.27.self_attn.qkv_proj.q_proj.weight8 +4 +input305� �02layers.27.mlp.down_proj.weight8 +2 +input306�0� 2layers.27.mlp.up_proj.weight8 += +input307� 2)layers.27.post_attention_layernorm.weight8 +4 +input308�0� 2layers.27.mlp.gate_proj.weight8 +> +input309� �2(layers.28.self_attn.o_proj.o_proj.weight8 +@ +input310�� 2*layers.28.self_attn.qkv_proj.v_proj.weight8 +4 +input311� 2 layers.28.input_layernorm.weight8 +: +input312�2&layers.28.self_attn.k_layernorm.weight8 +@ +input313�� 2*layers.28.self_attn.qkv_proj.k_proj.weight8 +: +input314�2&layers.28.self_attn.q_layernorm.weight8 +@ +input315�� 2*layers.28.self_attn.qkv_proj.q_proj.weight8 +4 +input316� �02layers.28.mlp.down_proj.weight8 +2 +input317�0� 2layers.28.mlp.up_proj.weight8 += +input318� 2)layers.28.post_attention_layernorm.weight8 +4 +input319�0� 2layers.28.mlp.gate_proj.weight8 +> +input320� �2(layers.29.self_attn.o_proj.o_proj.weight8 +@ +input321�� 2*layers.29.self_attn.qkv_proj.v_proj.weight8 +4 +input322� 2 layers.29.input_layernorm.weight8 +: +input323�2&layers.29.self_attn.k_layernorm.weight8 +@ +input324�� 2*layers.29.self_attn.qkv_proj.k_proj.weight8 +: +input325�2&layers.29.self_attn.q_layernorm.weight8 +@ +input326�� 2*layers.29.self_attn.qkv_proj.q_proj.weight8 +4 +input327� �02layers.29.mlp.down_proj.weight8 +2 +input328�0� 2layers.29.mlp.up_proj.weight8 += +input329� 2)layers.29.post_attention_layernorm.weight8 +4 +input330�0� 2layers.29.mlp.gate_proj.weight8 +> +input331� �2(layers.30.self_attn.o_proj.o_proj.weight8 +@ +input332�� 2*layers.30.self_attn.qkv_proj.v_proj.weight8 +4 +input333� 2 layers.30.input_layernorm.weight8 +: +input334�2&layers.30.self_attn.k_layernorm.weight8 +@ +input335�� 2*layers.30.self_attn.qkv_proj.k_proj.weight8 +: +input336�2&layers.30.self_attn.q_layernorm.weight8 +@ +input337�� 2*layers.30.self_attn.qkv_proj.q_proj.weight8 +4 +input338� �02layers.30.mlp.down_proj.weight8 +2 +input339�0� 2layers.30.mlp.up_proj.weight8 += +input340� 2)layers.30.post_attention_layernorm.weight8 +4 +input341�0� 2layers.30.mlp.gate_proj.weight8 +> +input342� �2(layers.31.self_attn.o_proj.o_proj.weight8 +@ +input343�� 2*layers.31.self_attn.qkv_proj.v_proj.weight8 +4 +input344� 2 layers.31.input_layernorm.weight8 +: +input345�2&layers.31.self_attn.k_layernorm.weight8 +@ +input346�� 2*layers.31.self_attn.qkv_proj.k_proj.weight8 +: +input347�2&layers.31.self_attn.q_layernorm.weight8 +@ +input348�� 2*layers.31.self_attn.qkv_proj.q_proj.weight8 +4 +input349� �02layers.31.mlp.down_proj.weight8 +2 +input350�0� 2layers.31.mlp.up_proj.weight8 += +input351� 2)layers.31.post_attention_layernorm.weight8 +4 +input352�0� 2layers.31.mlp.gate_proj.weight8 +> +input353� �2(layers.32.self_attn.o_proj.o_proj.weight8 +@ +input354�� 2*layers.32.self_attn.qkv_proj.v_proj.weight8 +4 +input355� 2 layers.32.input_layernorm.weight8 +: +input356�2&layers.32.self_attn.k_layernorm.weight8 +@ +input357�� 2*layers.32.self_attn.qkv_proj.k_proj.weight8 +: +input358�2&layers.32.self_attn.q_layernorm.weight8 +@ +input359�� 2*layers.32.self_attn.qkv_proj.q_proj.weight8 +4 +input360� �02layers.32.mlp.down_proj.weight8 +2 +input361�0� 2layers.32.mlp.up_proj.weight8 += +input362� 2)layers.32.post_attention_layernorm.weight8 +4 +input363�0� 2layers.32.mlp.gate_proj.weight8 +> +input364� �2(layers.33.self_attn.o_proj.o_proj.weight8 +@ +input365�� 2*layers.33.self_attn.qkv_proj.v_proj.weight8 +4 +input366� 2 layers.33.input_layernorm.weight8 +: +input367�2&layers.33.self_attn.k_layernorm.weight8 +@ +input368�� 2*layers.33.self_attn.qkv_proj.k_proj.weight8 +: +input369�2&layers.33.self_attn.q_layernorm.weight8 +@ +input370�� 2*layers.33.self_attn.qkv_proj.q_proj.weight8 +4 +input371� �02layers.33.mlp.down_proj.weight8 +2 +input372�0� 2layers.33.mlp.up_proj.weight8 += +input373� 2)layers.33.post_attention_layernorm.weight8 +4 +input374�0� 2layers.33.mlp.gate_proj.weight8 +> +input375� �2(layers.34.self_attn.o_proj.o_proj.weight8 +@ +input376�� 2*layers.34.self_attn.qkv_proj.v_proj.weight8 +4 +input377� 2 layers.34.input_layernorm.weight8 +: +input378�2&layers.34.self_attn.k_layernorm.weight8 +@ +input379�� 2*layers.34.self_attn.qkv_proj.k_proj.weight8 +: +input380�2&layers.34.self_attn.q_layernorm.weight8 +@ +input381�� 2*layers.34.self_attn.qkv_proj.q_proj.weight8 +4 +input382� �02layers.34.mlp.down_proj.weight8 +2 +input383�0� 2layers.34.mlp.up_proj.weight8 += +input384� 2)layers.34.post_attention_layernorm.weight8 +4 +input385�0� 2layers.34.mlp.gate_proj.weight8 +> +input386� �2(layers.35.self_attn.o_proj.o_proj.weight8 +@ +input387�� 2*layers.35.self_attn.qkv_proj.v_proj.weight8 +4 +input388� 2 layers.35.input_layernorm.weight8 +: +input389�2&layers.35.self_attn.k_layernorm.weight8 +@ +input390�� 2*layers.35.self_attn.qkv_proj.k_proj.weight8 +: +input391�2&layers.35.self_attn.q_layernorm.weight8 +@ +input392�� 2*layers.35.self_attn.qkv_proj.q_proj.weight8 +4 +input393� �02layers.35.mlp.down_proj.weight8 +2 +input394�0� 2layers.35.mlp.up_proj.weight8 += +input395� 2)layers.35.post_attention_layernorm.weight8 +4 +input396�0� 2layers.35.mlp.gate_proj.weight8 +% +input397��� 2lm_head.weight8 + +input398� 2 norm.weight8' +output0�� �2embed_tokens.weight> +output1��2'layers.0.self_attn.o_proj.o_proj.weight> +output2� �2)layers.0.self_attn.qkv_proj.v_proj.weight1 +output3� 2layers.0.input_layernorm.weight6 +output4�2%layers.0.self_attn.k_layernorm.weight> +output5� @2)layers.0.self_attn.qkv_proj.k_proj.weight6 +output6�2%layers.0.self_attn.q_layernorm.weight? +output7� @2)layers.0.self_attn.qkv_proj.q_proj.weight3 +output8 ��2layers.0.mlp.down_proj.weight0 +output90� �2layers.0.mlp.up_proj.weight; +output10� 2(layers.0.post_attention_layernorm.weight3 +output110� �2layers.0.mlp.gate_proj.weight? +output12��2'layers.1.self_attn.o_proj.o_proj.weight? +output13� �2)layers.1.self_attn.qkv_proj.v_proj.weight2 +output14� 2layers.1.input_layernorm.weight7 +output15�2%layers.1.self_attn.k_layernorm.weight? +output16� @2)layers.1.self_attn.qkv_proj.k_proj.weight7 +output17�2%layers.1.self_attn.q_layernorm.weight@ +output18� @2)layers.1.self_attn.qkv_proj.q_proj.weight4 +output19 ��2layers.1.mlp.down_proj.weight1 +output200� �2layers.1.mlp.up_proj.weight; +output21� 2(layers.1.post_attention_layernorm.weight3 +output220� �2layers.1.mlp.gate_proj.weight? +output23��2'layers.2.self_attn.o_proj.o_proj.weight? +output24� �2)layers.2.self_attn.qkv_proj.v_proj.weight2 +output25� 2layers.2.input_layernorm.weight7 +output26�2%layers.2.self_attn.k_layernorm.weight? +output27� @2)layers.2.self_attn.qkv_proj.k_proj.weight7 +output28�2%layers.2.self_attn.q_layernorm.weight@ +output29� @2)layers.2.self_attn.qkv_proj.q_proj.weight4 +output30 ��2layers.2.mlp.down_proj.weight1 +output310� �2layers.2.mlp.up_proj.weight; +output32� 2(layers.2.post_attention_layernorm.weight3 +output330� �2layers.2.mlp.gate_proj.weight? +output34��2'layers.3.self_attn.o_proj.o_proj.weight? +output35� �2)layers.3.self_attn.qkv_proj.v_proj.weight2 +output36� 2layers.3.input_layernorm.weight7 +output37�2%layers.3.self_attn.k_layernorm.weight? +output38� @2)layers.3.self_attn.qkv_proj.k_proj.weight7 +output39�2%layers.3.self_attn.q_layernorm.weight@ +output40� @2)layers.3.self_attn.qkv_proj.q_proj.weight4 +output41 ��2layers.3.mlp.down_proj.weight1 +output420� �2layers.3.mlp.up_proj.weight; +output43� 2(layers.3.post_attention_layernorm.weight3 +output440� �2layers.3.mlp.gate_proj.weight? +output45��2'layers.4.self_attn.o_proj.o_proj.weight? +output46� �2)layers.4.self_attn.qkv_proj.v_proj.weight2 +output47� 2layers.4.input_layernorm.weight7 +output48�2%layers.4.self_attn.k_layernorm.weight? +output49� @2)layers.4.self_attn.qkv_proj.k_proj.weight7 +output50�2%layers.4.self_attn.q_layernorm.weight@ +output51� @2)layers.4.self_attn.qkv_proj.q_proj.weight4 +output52 ��2layers.4.mlp.down_proj.weight1 +output530� �2layers.4.mlp.up_proj.weight; +output54� 2(layers.4.post_attention_layernorm.weight3 +output550� �2layers.4.mlp.gate_proj.weight? +output56��2'layers.5.self_attn.o_proj.o_proj.weight? +output57� �2)layers.5.self_attn.qkv_proj.v_proj.weight2 +output58� 2layers.5.input_layernorm.weight7 +output59�2%layers.5.self_attn.k_layernorm.weight? +output60� @2)layers.5.self_attn.qkv_proj.k_proj.weight7 +output61�2%layers.5.self_attn.q_layernorm.weight@ +output62� @2)layers.5.self_attn.qkv_proj.q_proj.weight4 +output63 ��2layers.5.mlp.down_proj.weight1 +output640� �2layers.5.mlp.up_proj.weight; +output65� 2(layers.5.post_attention_layernorm.weight3 +output660� �2layers.5.mlp.gate_proj.weight? +output67��2'layers.6.self_attn.o_proj.o_proj.weight? +output68� �2)layers.6.self_attn.qkv_proj.v_proj.weight2 +output69� 2layers.6.input_layernorm.weight7 +output70�2%layers.6.self_attn.k_layernorm.weight? +output71� @2)layers.6.self_attn.qkv_proj.k_proj.weight7 +output72�2%layers.6.self_attn.q_layernorm.weight@ +output73� @2)layers.6.self_attn.qkv_proj.q_proj.weight4 +output74 ��2layers.6.mlp.down_proj.weight1 +output750� �2layers.6.mlp.up_proj.weight; +output76� 2(layers.6.post_attention_layernorm.weight3 +output770� �2layers.6.mlp.gate_proj.weight? +output78��2'layers.7.self_attn.o_proj.o_proj.weight? +output79� �2)layers.7.self_attn.qkv_proj.v_proj.weight2 +output80� 2layers.7.input_layernorm.weight7 +output81�2%layers.7.self_attn.k_layernorm.weight? +output82� @2)layers.7.self_attn.qkv_proj.k_proj.weight7 +output83�2%layers.7.self_attn.q_layernorm.weight@ +output84� @2)layers.7.self_attn.qkv_proj.q_proj.weight4 +output85 ��2layers.7.mlp.down_proj.weight1 +output860� �2layers.7.mlp.up_proj.weight; +output87� 2(layers.7.post_attention_layernorm.weight3 +output880� �2layers.7.mlp.gate_proj.weight? +output89��2'layers.8.self_attn.o_proj.o_proj.weight? +output90� �2)layers.8.self_attn.qkv_proj.v_proj.weight2 +output91� 2layers.8.input_layernorm.weight7 +output92�2%layers.8.self_attn.k_layernorm.weight? +output93� @2)layers.8.self_attn.qkv_proj.k_proj.weight7 +output94�2%layers.8.self_attn.q_layernorm.weight@ +output95� @2)layers.8.self_attn.qkv_proj.q_proj.weight4 +output96 ��2layers.8.mlp.down_proj.weight1 +output970� �2layers.8.mlp.up_proj.weight; +output98� 2(layers.8.post_attention_layernorm.weight3 +output990� �2layers.8.mlp.gate_proj.weight@ + output100��2'layers.9.self_attn.o_proj.o_proj.weight@ + output101� �2)layers.9.self_attn.qkv_proj.v_proj.weight3 + output102� 2layers.9.input_layernorm.weight8 + output103�2%layers.9.self_attn.k_layernorm.weight@ + output104� @2)layers.9.self_attn.qkv_proj.k_proj.weight8 + output105�2%layers.9.self_attn.q_layernorm.weightA + output106� @2)layers.9.self_attn.qkv_proj.q_proj.weight5 + output107 ��2layers.9.mlp.down_proj.weight2 + output1080� �2layers.9.mlp.up_proj.weight< + output109� 2(layers.9.post_attention_layernorm.weight4 + output1100� �2layers.9.mlp.gate_proj.weightA + output111��2(layers.10.self_attn.o_proj.o_proj.weightA + output112� �2*layers.10.self_attn.qkv_proj.v_proj.weight4 + output113� 2 layers.10.input_layernorm.weight9 + output114�2&layers.10.self_attn.k_layernorm.weightA + output115� @2*layers.10.self_attn.qkv_proj.k_proj.weight9 + output116�2&layers.10.self_attn.q_layernorm.weightB + output117� @2*layers.10.self_attn.qkv_proj.q_proj.weight6 + output118 ��2layers.10.mlp.down_proj.weight3 + output1190� �2layers.10.mlp.up_proj.weight= + output120� 2)layers.10.post_attention_layernorm.weight5 + output1210� �2layers.10.mlp.gate_proj.weightA + output122��2(layers.11.self_attn.o_proj.o_proj.weightA + output123� �2*layers.11.self_attn.qkv_proj.v_proj.weight4 + output124� 2 layers.11.input_layernorm.weight9 + output125�2&layers.11.self_attn.k_layernorm.weightA + output126� @2*layers.11.self_attn.qkv_proj.k_proj.weight9 + output127�2&layers.11.self_attn.q_layernorm.weightB + output128� @2*layers.11.self_attn.qkv_proj.q_proj.weight6 + output129 ��2layers.11.mlp.down_proj.weight3 + output1300� �2layers.11.mlp.up_proj.weight= + output131� 2)layers.11.post_attention_layernorm.weight5 + output1320� �2layers.11.mlp.gate_proj.weightA + output133��2(layers.12.self_attn.o_proj.o_proj.weightA + output134� �2*layers.12.self_attn.qkv_proj.v_proj.weight4 + output135� 2 layers.12.input_layernorm.weight9 + output136�2&layers.12.self_attn.k_layernorm.weightA + output137� @2*layers.12.self_attn.qkv_proj.k_proj.weight9 + output138�2&layers.12.self_attn.q_layernorm.weightB + output139� @2*layers.12.self_attn.qkv_proj.q_proj.weight6 + output140 ��2layers.12.mlp.down_proj.weight3 + output1410� �2layers.12.mlp.up_proj.weight= + output142� 2)layers.12.post_attention_layernorm.weight5 + output1430� �2layers.12.mlp.gate_proj.weightA + output144��2(layers.13.self_attn.o_proj.o_proj.weightA + output145� �2*layers.13.self_attn.qkv_proj.v_proj.weight4 + output146� 2 layers.13.input_layernorm.weight9 + output147�2&layers.13.self_attn.k_layernorm.weightA + output148� @2*layers.13.self_attn.qkv_proj.k_proj.weight9 + output149�2&layers.13.self_attn.q_layernorm.weightB + output150� @2*layers.13.self_attn.qkv_proj.q_proj.weight6 + output151 ��2layers.13.mlp.down_proj.weight3 + output1520� �2layers.13.mlp.up_proj.weight= + output153� 2)layers.13.post_attention_layernorm.weight5 + output1540� �2layers.13.mlp.gate_proj.weightA + output155��2(layers.14.self_attn.o_proj.o_proj.weightA + output156� �2*layers.14.self_attn.qkv_proj.v_proj.weight4 + output157� 2 layers.14.input_layernorm.weight9 + output158�2&layers.14.self_attn.k_layernorm.weightA + output159� @2*layers.14.self_attn.qkv_proj.k_proj.weight9 + output160�2&layers.14.self_attn.q_layernorm.weightB + output161� @2*layers.14.self_attn.qkv_proj.q_proj.weight6 + output162 ��2layers.14.mlp.down_proj.weight3 + output1630� �2layers.14.mlp.up_proj.weight= + output164� 2)layers.14.post_attention_layernorm.weight5 + output1650� �2layers.14.mlp.gate_proj.weightA + output166��2(layers.15.self_attn.o_proj.o_proj.weightA + output167� �2*layers.15.self_attn.qkv_proj.v_proj.weight4 + output168� 2 layers.15.input_layernorm.weight9 + output169�2&layers.15.self_attn.k_layernorm.weightA + output170� @2*layers.15.self_attn.qkv_proj.k_proj.weight9 + output171�2&layers.15.self_attn.q_layernorm.weightB + output172� @2*layers.15.self_attn.qkv_proj.q_proj.weight6 + output173 ��2layers.15.mlp.down_proj.weight3 + output1740� �2layers.15.mlp.up_proj.weight= + output175� 2)layers.15.post_attention_layernorm.weight5 + output1760� �2layers.15.mlp.gate_proj.weightA + output177��2(layers.16.self_attn.o_proj.o_proj.weightA + output178� �2*layers.16.self_attn.qkv_proj.v_proj.weight4 + output179� 2 layers.16.input_layernorm.weight9 + output180�2&layers.16.self_attn.k_layernorm.weightA + output181� @2*layers.16.self_attn.qkv_proj.k_proj.weight9 + output182�2&layers.16.self_attn.q_layernorm.weightB + output183� @2*layers.16.self_attn.qkv_proj.q_proj.weight6 + output184 ��2layers.16.mlp.down_proj.weight3 + output1850� �2layers.16.mlp.up_proj.weight= + output186� 2)layers.16.post_attention_layernorm.weight5 + output1870� �2layers.16.mlp.gate_proj.weightA + output188��2(layers.17.self_attn.o_proj.o_proj.weightA + output189� �2*layers.17.self_attn.qkv_proj.v_proj.weight4 + output190� 2 layers.17.input_layernorm.weight9 + output191�2&layers.17.self_attn.k_layernorm.weightA + output192� @2*layers.17.self_attn.qkv_proj.k_proj.weight9 + output193�2&layers.17.self_attn.q_layernorm.weightB + output194� @2*layers.17.self_attn.qkv_proj.q_proj.weight6 + output195 ��2layers.17.mlp.down_proj.weight3 + output1960� �2layers.17.mlp.up_proj.weight= + output197� 2)layers.17.post_attention_layernorm.weight5 + output1980� �2layers.17.mlp.gate_proj.weightA + output199��2(layers.18.self_attn.o_proj.o_proj.weightA + output200� �2*layers.18.self_attn.qkv_proj.v_proj.weight4 + output201� 2 layers.18.input_layernorm.weight9 + output202�2&layers.18.self_attn.k_layernorm.weightA + output203� @2*layers.18.self_attn.qkv_proj.k_proj.weight9 + output204�2&layers.18.self_attn.q_layernorm.weightB + output205� @2*layers.18.self_attn.qkv_proj.q_proj.weight6 + output206 ��2layers.18.mlp.down_proj.weight3 + output2070� �2layers.18.mlp.up_proj.weight= + output208� 2)layers.18.post_attention_layernorm.weight5 + output2090� �2layers.18.mlp.gate_proj.weightA + output210��2(layers.19.self_attn.o_proj.o_proj.weightA + output211� �2*layers.19.self_attn.qkv_proj.v_proj.weight4 + output212� 2 layers.19.input_layernorm.weight9 + output213�2&layers.19.self_attn.k_layernorm.weightA + output214� @2*layers.19.self_attn.qkv_proj.k_proj.weight9 + output215�2&layers.19.self_attn.q_layernorm.weightB + output216� @2*layers.19.self_attn.qkv_proj.q_proj.weight6 + output217 ��2layers.19.mlp.down_proj.weight3 + output2180� �2layers.19.mlp.up_proj.weight= + output219� 2)layers.19.post_attention_layernorm.weight5 + output2200� �2layers.19.mlp.gate_proj.weightA + output221��2(layers.20.self_attn.o_proj.o_proj.weightA + output222� �2*layers.20.self_attn.qkv_proj.v_proj.weight4 + output223� 2 layers.20.input_layernorm.weight9 + output224�2&layers.20.self_attn.k_layernorm.weightA + output225� @2*layers.20.self_attn.qkv_proj.k_proj.weight9 + output226�2&layers.20.self_attn.q_layernorm.weightB + output227� @2*layers.20.self_attn.qkv_proj.q_proj.weight6 + output228 ��2layers.20.mlp.down_proj.weight3 + output2290� �2layers.20.mlp.up_proj.weight= + output230� 2)layers.20.post_attention_layernorm.weight5 + output2310� �2layers.20.mlp.gate_proj.weightA + output232��2(layers.21.self_attn.o_proj.o_proj.weightA + output233� �2*layers.21.self_attn.qkv_proj.v_proj.weight4 + output234� 2 layers.21.input_layernorm.weight9 + output235�2&layers.21.self_attn.k_layernorm.weightA + output236� @2*layers.21.self_attn.qkv_proj.k_proj.weight9 + output237�2&layers.21.self_attn.q_layernorm.weightB + output238� @2*layers.21.self_attn.qkv_proj.q_proj.weight6 + output239 ��2layers.21.mlp.down_proj.weight3 + output2400� �2layers.21.mlp.up_proj.weight= + output241� 2)layers.21.post_attention_layernorm.weight5 + output2420� �2layers.21.mlp.gate_proj.weightA + output243��2(layers.22.self_attn.o_proj.o_proj.weightA + output244� �2*layers.22.self_attn.qkv_proj.v_proj.weight4 + output245� 2 layers.22.input_layernorm.weight9 + output246�2&layers.22.self_attn.k_layernorm.weightA + output247� @2*layers.22.self_attn.qkv_proj.k_proj.weight9 + output248�2&layers.22.self_attn.q_layernorm.weightB + output249� @2*layers.22.self_attn.qkv_proj.q_proj.weight6 + output250 ��2layers.22.mlp.down_proj.weight3 + output2510� �2layers.22.mlp.up_proj.weight= + output252� 2)layers.22.post_attention_layernorm.weight5 + output2530� �2layers.22.mlp.gate_proj.weightA + output254��2(layers.23.self_attn.o_proj.o_proj.weightA + output255� �2*layers.23.self_attn.qkv_proj.v_proj.weight4 + output256� 2 layers.23.input_layernorm.weight9 + output257�2&layers.23.self_attn.k_layernorm.weightA + output258� @2*layers.23.self_attn.qkv_proj.k_proj.weight9 + output259�2&layers.23.self_attn.q_layernorm.weightB + output260� @2*layers.23.self_attn.qkv_proj.q_proj.weight6 + output261 ��2layers.23.mlp.down_proj.weight3 + output2620� �2layers.23.mlp.up_proj.weight= + output263� 2)layers.23.post_attention_layernorm.weight5 + output2640� �2layers.23.mlp.gate_proj.weightA + output265��2(layers.24.self_attn.o_proj.o_proj.weightA + output266� �2*layers.24.self_attn.qkv_proj.v_proj.weight4 + output267� 2 layers.24.input_layernorm.weight9 + output268�2&layers.24.self_attn.k_layernorm.weightA + output269� @2*layers.24.self_attn.qkv_proj.k_proj.weight9 + output270�2&layers.24.self_attn.q_layernorm.weightB + output271� @2*layers.24.self_attn.qkv_proj.q_proj.weight6 + output272 ��2layers.24.mlp.down_proj.weight3 + output2730� �2layers.24.mlp.up_proj.weight= + output274� 2)layers.24.post_attention_layernorm.weight5 + output2750� �2layers.24.mlp.gate_proj.weightA + output276��2(layers.25.self_attn.o_proj.o_proj.weightA + output277� �2*layers.25.self_attn.qkv_proj.v_proj.weight4 + output278� 2 layers.25.input_layernorm.weight9 + output279�2&layers.25.self_attn.k_layernorm.weightA + output280� @2*layers.25.self_attn.qkv_proj.k_proj.weight9 + output281�2&layers.25.self_attn.q_layernorm.weightB + output282� @2*layers.25.self_attn.qkv_proj.q_proj.weight6 + output283 ��2layers.25.mlp.down_proj.weight3 + output2840� �2layers.25.mlp.up_proj.weight= + output285� 2)layers.25.post_attention_layernorm.weight5 + output2860� �2layers.25.mlp.gate_proj.weightA + output287��2(layers.26.self_attn.o_proj.o_proj.weightA + output288� �2*layers.26.self_attn.qkv_proj.v_proj.weight4 + output289� 2 layers.26.input_layernorm.weight9 + output290�2&layers.26.self_attn.k_layernorm.weightA + output291� @2*layers.26.self_attn.qkv_proj.k_proj.weight9 + output292�2&layers.26.self_attn.q_layernorm.weightB + output293� @2*layers.26.self_attn.qkv_proj.q_proj.weight6 + output294 ��2layers.26.mlp.down_proj.weight3 + output2950� �2layers.26.mlp.up_proj.weight= + output296� 2)layers.26.post_attention_layernorm.weight5 + output2970� �2layers.26.mlp.gate_proj.weightA + output298��2(layers.27.self_attn.o_proj.o_proj.weightA + output299� �2*layers.27.self_attn.qkv_proj.v_proj.weight4 + output300� 2 layers.27.input_layernorm.weight9 + output301�2&layers.27.self_attn.k_layernorm.weightA + output302� @2*layers.27.self_attn.qkv_proj.k_proj.weight9 + output303�2&layers.27.self_attn.q_layernorm.weightB + output304� @2*layers.27.self_attn.qkv_proj.q_proj.weight6 + output305 ��2layers.27.mlp.down_proj.weight3 + output3060� �2layers.27.mlp.up_proj.weight= + output307� 2)layers.27.post_attention_layernorm.weight5 + output3080� �2layers.27.mlp.gate_proj.weightA + output309��2(layers.28.self_attn.o_proj.o_proj.weightA + output310� �2*layers.28.self_attn.qkv_proj.v_proj.weight4 + output311� 2 layers.28.input_layernorm.weight9 + output312�2&layers.28.self_attn.k_layernorm.weightA + output313� @2*layers.28.self_attn.qkv_proj.k_proj.weight9 + output314�2&layers.28.self_attn.q_layernorm.weightB + output315� @2*layers.28.self_attn.qkv_proj.q_proj.weight6 + output316 ��2layers.28.mlp.down_proj.weight3 + output3170� �2layers.28.mlp.up_proj.weight= + output318� 2)layers.28.post_attention_layernorm.weight5 + output3190� �2layers.28.mlp.gate_proj.weightA + output320��2(layers.29.self_attn.o_proj.o_proj.weightA + output321� �2*layers.29.self_attn.qkv_proj.v_proj.weight4 + output322� 2 layers.29.input_layernorm.weight9 + output323�2&layers.29.self_attn.k_layernorm.weightA + output324� @2*layers.29.self_attn.qkv_proj.k_proj.weight9 + output325�2&layers.29.self_attn.q_layernorm.weightB + output326� @2*layers.29.self_attn.qkv_proj.q_proj.weight6 + output327 ��2layers.29.mlp.down_proj.weight3 + output3280� �2layers.29.mlp.up_proj.weight= + output329� 2)layers.29.post_attention_layernorm.weight5 + output3300� �2layers.29.mlp.gate_proj.weightA + output331��2(layers.30.self_attn.o_proj.o_proj.weightA + output332� �2*layers.30.self_attn.qkv_proj.v_proj.weight4 + output333� 2 layers.30.input_layernorm.weight9 + output334�2&layers.30.self_attn.k_layernorm.weightA + output335� @2*layers.30.self_attn.qkv_proj.k_proj.weight9 + output336�2&layers.30.self_attn.q_layernorm.weightB + output337� @2*layers.30.self_attn.qkv_proj.q_proj.weight6 + output338 ��2layers.30.mlp.down_proj.weight3 + output3390� �2layers.30.mlp.up_proj.weight= + output340� 2)layers.30.post_attention_layernorm.weight5 + output3410� �2layers.30.mlp.gate_proj.weightA + output342��2(layers.31.self_attn.o_proj.o_proj.weightA + output343� �2*layers.31.self_attn.qkv_proj.v_proj.weight4 + output344� 2 layers.31.input_layernorm.weight9 + output345�2&layers.31.self_attn.k_layernorm.weightA + output346� @2*layers.31.self_attn.qkv_proj.k_proj.weight9 + output347�2&layers.31.self_attn.q_layernorm.weightB + output348� @2*layers.31.self_attn.qkv_proj.q_proj.weight6 + output349 ��2layers.31.mlp.down_proj.weight3 + output3500� �2layers.31.mlp.up_proj.weight= + output351� 2)layers.31.post_attention_layernorm.weight5 + output3520� �2layers.31.mlp.gate_proj.weightA + output353��2(layers.32.self_attn.o_proj.o_proj.weightA + output354� �2*layers.32.self_attn.qkv_proj.v_proj.weight4 + output355� 2 layers.32.input_layernorm.weight9 + output356�2&layers.32.self_attn.k_layernorm.weightA + output357� @2*layers.32.self_attn.qkv_proj.k_proj.weight9 + output358�2&layers.32.self_attn.q_layernorm.weightB + output359� @2*layers.32.self_attn.qkv_proj.q_proj.weight6 + output360 ��2layers.32.mlp.down_proj.weight3 + output3610� �2layers.32.mlp.up_proj.weight= + output362� 2)layers.32.post_attention_layernorm.weight5 + output3630� �2layers.32.mlp.gate_proj.weightA + output364��2(layers.33.self_attn.o_proj.o_proj.weightA + output365� �2*layers.33.self_attn.qkv_proj.v_proj.weight4 + output366� 2 layers.33.input_layernorm.weight9 + output367�2&layers.33.self_attn.k_layernorm.weightA + output368� @2*layers.33.self_attn.qkv_proj.k_proj.weight9 + output369�2&layers.33.self_attn.q_layernorm.weightB + output370� @2*layers.33.self_attn.qkv_proj.q_proj.weight6 + output371 ��2layers.33.mlp.down_proj.weight3 + output3720� �2layers.33.mlp.up_proj.weight= + output373� 2)layers.33.post_attention_layernorm.weight5 + output3740� �2layers.33.mlp.gate_proj.weightA + output375��2(layers.34.self_attn.o_proj.o_proj.weightA + output376� �2*layers.34.self_attn.qkv_proj.v_proj.weight4 + output377� 2 layers.34.input_layernorm.weight9 + output378�2&layers.34.self_attn.k_layernorm.weightA + output379� @2*layers.34.self_attn.qkv_proj.k_proj.weight9 + output380�2&layers.34.self_attn.q_layernorm.weightB + output381� @2*layers.34.self_attn.qkv_proj.q_proj.weight6 + output382 ��2layers.34.mlp.down_proj.weight3 + output3830� �2layers.34.mlp.up_proj.weight= + output384� 2)layers.34.post_attention_layernorm.weight5 + output3850� �2layers.34.mlp.gate_proj.weightA + output386��2(layers.35.self_attn.o_proj.o_proj.weightA + output387� �2*layers.35.self_attn.qkv_proj.v_proj.weight4 + output388� 2 layers.35.input_layernorm.weight9 + output389�2&layers.35.self_attn.k_layernorm.weightA + output390� @2*layers.35.self_attn.qkv_proj.k_proj.weight9 + output391�2&layers.35.self_attn.q_layernorm.weightB + output392� @2*layers.35.self_attn.qkv_proj.q_proj.weight6 + output393 ��2layers.35.mlp.down_proj.weight3 + output3940� �2layers.35.mlp.up_proj.weight= + output395� 2)layers.35.post_attention_layernorm.weight5 + output3960� �2layers.35.mlp.gate_proj.weight$ + output397��� 2lm_head.weight + output398� 2 norm.weight \ No newline at end of file diff --git a/layout_opt/model/graph.hlo b/layout_opt/model/graph.hlo new file mode 100644 index 0000000000000000000000000000000000000000..2d092f6ea8729a6137dcb6d4e5ac48d0ea43f55a --- /dev/null +++ b/layout_opt/model/graph.hlo @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12b45b028e502b2dd8c42c1287fbdbea434454143a30d473806853bc18673d98 +size 211060 diff --git a/model.pt b/model.pt new file mode 100644 index 0000000000000000000000000000000000000000..d38726a91d0bb39dcff29932fad507760f80c31e --- /dev/null +++ b/model.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c36077018a9f85728962cc73bfcba755ce1d5d5b6f608dacf65d7b95596eb109 +size 47198475 diff --git a/neuron_config.json b/neuron_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5bc77c2eb8f4baae782ec6543d174f5471361ade --- /dev/null +++ b/neuron_config.json @@ -0,0 +1,218 @@ +{ + "_attn_implementation_autoset": false, + "_name_or_path": "Qwen/Qwen3-8B", + "add_cross_attention": false, + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "attribute_map": {}, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 151643, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 151645, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "fused_spec_config": null, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 12288, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 40960, + "max_window_layers": 36, + "metadata": null, + "min_length": 0, + "model_type": "qwen3", + "neuron_config": { + "activation_quantization_type": null, + "allow_input_truncation": false, + "apply_seq_ids_mask": false, + "async_mode": false, + "attention_dp_degree": 1, + "attention_dtype": null, + "attn_block_cte_nki_kernel_enabled": false, + "attn_block_tkg_nki_kernel_cache_update": false, + "attn_block_tkg_nki_kernel_enabled": false, + "attn_cls": { + "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3", + "__name__": "NeuronQwen3Attention" + }, + "attn_kernel_enabled": null, + "attn_tkg_builtin_kernel_enabled": false, + "attn_tkg_nki_kernel_enabled": false, + "batch_size": 1, + "bucket_n_active_tokens": false, + "buckets": [ + 1024 + ], + "cast_type": "config", + "cc_pipeline_tiling_factor": 2, + "chunked_prefill_config": null, + "context_encoding_buckets": null, + "cp_degree": 1, + "ctx_batch_size": 1, + "disable_kv_cache_tiling": false, + "draft_model_modules_to_not_convert": null, + "enable_bucketing": true, + "enable_eagle_draft_input_norm": false, + "enable_eagle_speculation": false, + "enable_fused_speculation": false, + "enable_long_context_mode": false, + "enable_output_completion_notifications": false, + "enable_spill_reload_dge": false, + "enable_token_tree": false, + "ep_degree": 1, + "expert_mlp_nki_kernel_enabled": null, + "flash_decoding_enabled": false, + "fused_qkv": false, + "fused_rmsnorm_skip_gamma": false, + "is_block_kv_layout": null, + "is_chunked_prefill": false, + "is_continuous_batching": true, + "is_eagle_draft": false, + "is_medusa": false, + "is_prefill_stage": null, + "is_prefix_caching": false, + "k_cache_transposed": false, + "kv_cache_batch_size": 1, + "kv_cache_padding_size": 0, + "kv_cache_quant": false, + "kv_cache_tiling": false, + "layer_boundary_markers": false, + "lm_head_pad": false, + "lm_head_pad_alignment_size": 1, + "local_ranks_size": 2, + "logical_nc_config": 1, + "lora_config": null, + "max_batch_size": 1, + "max_context_length": 1024, + "max_length": 1024, + "max_new_tokens": null, + "medusa_speculation_length": 0, + "medusa_tree": null, + "mlp_kernel_enabled": false, + "mlp_kernel_fuse_residual_add": false, + "modules_to_not_convert": null, + "moe_fused_nki_kernel_enabled": null, + "n_active_tokens": 1024, + "n_positions": 1024, + "num_medusa_heads": 0, + "on_cpu": false, + "on_device_sampling_config": { + "deterministic": false, + "do_sample": false, + "dynamic": true, + "global_topk": 256, + "on_device_sampling_config": true, + "temperature": 1.0, + "top_k": 1, + "top_k_kernel_enabled": false, + "top_p": 1.0 + }, + "output_logits": false, + "overrides_torch_dtype": true, + "pa_block_size": 1024, + "pa_num_blocks": 1, + "padding_side": "right", + "pp_degree": 1, + "prefix_buckets": null, + "qk_layernorm": false, + "qkv_kernel_enabled": false, + "qkv_kernel_fuse_residual_add": false, + "qkv_kernel_nbsd_layout": false, + "quantization_dtype": "int8", + "quantization_type": "per_tensor_symmetric", + "quantize_clamp_bound": Infinity, + "quantized": false, + "quantized_checkpoints_path": null, + "quantized_mlp_kernel_enabled": false, + "rmsnorm_quantize_kernel_enabled": false, + "router_topk_nki_kernel_enabled": null, + "rpl_reduce_dtype": null, + "save_sharded_checkpoint": true, + "scratchpad_page_size": null, + "seq_len": 1024, + "seq_len_threshold_for_cc_tiling": 16384, + "sequence_parallel_enabled": false, + "shared_mlp_nki_kernel_enabled": null, + "skip_sharding": false, + "skip_warmup": false, + "spec_batch_size": 1, + "speculation_length": 0, + "start_rank_id": 0, + "target": null, + "tile_cc": false, + "tkg_batch_size": 1, + "token_generation_buckets": null, + "token_tree_config": null, + "torch_dtype": "bfloat16", + "tp_degree": 2, + "vocab_parallel": false, + "weight_gather_seq_len_threshold": 32768, + "weights_to_skip_layout_optimization": [], + "world_size": 2 + }, + "no_repeat_ngram_size": 0, + "num_attention_heads": 32, + "num_beam_groups": 1, + "num_beams": 1, + "num_cores_per_group": 1, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sep_token_id": null, + "sliding_window": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torchscript": false, + "transformers_version": "4.51.0", + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/token_generation_model/_tp0_bk0/command.txt b/token_generation_model/_tp0_bk0/command.txt new file mode 100644 index 0000000000000000000000000000000000000000..6a4b25ed5f74d447c75f0ba6bd92c492a71957cd --- /dev/null +++ b/token_generation_model/_tp0_bk0/command.txt @@ -0,0 +1 @@ +neuronx-cc compile --framework=XLA model.MODULE_6ef5ba8b41fbbe77f080+74ae8282.hlo_module.pb --output model.MODULE_6ef5ba8b41fbbe77f080+74ae8282.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ' --lnc=1 -O2 --internal-hlo2tensorizer-options=--verify-hlo=true --logfile=log-neuron-cc.txt --enable-internal-neff-wrapper --verbose=35 \ No newline at end of file diff --git a/token_generation_model/_tp0_bk0/compile_flags.MODULE_6ef5ba8b41fbbe77f080+74ae8282.json b/token_generation_model/_tp0_bk0/compile_flags.MODULE_6ef5ba8b41fbbe77f080+74ae8282.json new file mode 100644 index 0000000000000000000000000000000000000000..065ffdfb88232d46693707dfda2b4bd142ecb2ef --- /dev/null +++ b/token_generation_model/_tp0_bk0/compile_flags.MODULE_6ef5ba8b41fbbe77f080+74ae8282.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ", "--lnc=1", "-O2", "--internal-hlo2tensorizer-options=--verify-hlo=true", "--logfile=/home/ubuntu/qwen3/token_generation_model/_tp0_bk0/log-neuron-cc.txt", "--enable-internal-neff-wrapper"] \ No newline at end of file diff --git a/token_generation_model/_tp0_bk0/global_metric_store.json b/token_generation_model/_tp0_bk0/global_metric_store.json new file mode 100644 index 0000000000000000000000000000000000000000..cb1886bf56dceea0a122f130df122abebb7e81ea --- /dev/null +++ b/token_generation_model/_tp0_bk0/global_metric_store.json @@ -0,0 +1,540 @@ +{ + "Average": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 99.8321762084961, + "StaticProfiler::AveragePartitionUtilization": 99.3888168334961, + "StaticProfiler::AveragePeUtilization": 99.65400695800781, + "StaticProfiler::LocalizationEfficiency": 109.9806137084961, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 110.06793212890625, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0 + } + }, + "Count": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 1, + "StaticProfiler::AveragePartitionUtilization": 1, + "StaticProfiler::AveragePeUtilization": 1, + "StaticProfiler::LocalizationEfficiency": 1, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1, + "TilingProfiler::AveragePeUtilizationAfterTiling": 1 + } + }, + "Sum": { + "compiletime": { + "AGOrderingAnalysisPass": 1.4457588195800781, + "AffinePredicateResolution": 0.05167531967163086, + "AliasDependencyElimination": 0.0026276111602783203, + "AliasDependencyInduction": 0.44934630393981934, + "AliasDependencyReset": 1.2677826881408691, + "BFComputeCutting": 0.06423807144165039, + "BirCodeGenLoop": 2.421293258666992, + "CCOpFusion": 0.41050028800964355, + "CanonicalizeConv": 9.999999974752427e-07, + "CanonicalizeDAGForPGTiling": 0.21233797073364258, + "CanonicalizeForTensorizer": 0.0003640000068116933, + "CanonicalizeIR": 0.06626629829406738, + "Canonicalizer": 0.007044999860227108, + "CoalesceCCOp": 0.19146490097045898, + "CommuteConcat": 0.03319668769836426, + "DMALocalityOpt": 0.035207271575927734, + "DMAProfiler": 0.08866691589355469, + "DMATilingProfiler": 0.07109546661376953, + "DataLocalityOpt": 1.910703182220459, + "DataStreaming": 0.15389323234558105, + "DeConcat": 0.012087583541870117, + "DeadCodeElimination": 0.035611867904663086, + "DeadStoreElimination": 0.37193870544433594, + "DelinearIndices": 0.2894127368927002, + "Delinearization": 0.1295926570892334, + "DoNothing": 0.00019550323486328125, + "DramToDramTranspose": 1.0679569244384766, + "DumpGraphAndMetadata": 0.24142217636108398, + "EliminateDivs": 0.17337489128112793, + "ExpandBatchNorm": 0.06027984619140625, + "ExpandISAMacro": 0.0909569263458252, + "FactorizeBlkDims": 0.24945974349975586, + "FactorizeThreadAxesInFreeDims": 0.03613853454589844, + "FlattenMacroLoop": 0.26774168014526367, + "GenericAccessSimplifier": 0.03175926208496094, + "HoistCompute": 4.8000001697801054e-05, + "IdentifyCrossPassTensors": 0.00013600000238511711, + "InferInitValue": 1.029360294342041, + "InferIntrinsicOnCC": 0.34307408332824707, + "InferNeuronTensor": 1.7935998439788818, + "InferNonlocalTensors": 3.6307339668273926, + "InferPSumTensor": 0.9782986640930176, + "InlineNativeKernels": 0.05374264717102051, + "InsertIOTransposes": 1.162278652191162, + "InsertLocalTransposes": 1.0349645614624023, + "InsertOffloadedTransposes": 0.0943443775177002, + "LICM": 0.1061861515045166, + "LateLegalizeInst": 0.22754216194152832, + "LateLegalizePostSplit": 0.09247255325317383, + "LateLowerReshapeOp": 0.04053616523742676, + "LateLowerTensorOp": 0.3356895446777344, + "LateNeuronInstComb": 0.4516925811767578, + "LayoutPreprocessing": 0.9441671371459961, + "LayoutPreprocessingAndAnalysis": 1.2680203914642334, + "LayoutRequirementAnalysis": 0.309098482131958, + "LegalizeCCOpLayout": 0.07318258285522461, + "LegalizeOpLevelAlias": 0.03343796730041504, + "LegalizePartitionReduce": 0.034781694412231445, + "LegalizeSundaAccess": 1.4558701515197754, + "LegalizeSundaMacro": 0.37755250930786133, + "LegalizeType": 0.20858454704284668, + "LocalLayoutOpt": 0.36218762397766113, + "LoopFusion": 0.31240200996398926, + "LoopSplitting": 0.013066768646240234, + "LowerBroadcast": 0.047890663146972656, + "LowerCCOpBlockAxis": 0.23094987869262695, + "LowerComplexBroadcast": 0.15572404861450195, + "LowerIntrinsics": 1.228858470916748, + "LowerTensorOp": 0.4897449016571045, + "LowerTranspose": 0.3995330333709717, + "MacroGeneration": 2.335334062576294, + "MaskPropagation": 0.14433836936950684, + "MemcastMotion": 0.00013000000035390258, + "MemcpyElimination": 3.9867260456085205, + "MutateDataType": 0.04344511032104492, + "NeuronAliasDependencyInduction": 0.025929927825927734, + "NeuronAliasDependencyReset": 0.04254412651062012, + "NeuronInstComb": 0.19350981712341309, + "NeuronLICM": 0.2897522449493408, + "NeuronLoopFusion": 0.4089043140411377, + "NeuronLoopInterchange": 0.04476189613342285, + "NeuronSimplifier": 0.30055856704711914, + "NeuronSimplifyPredicates": 0.18221426010131836, + "NeuronValueNumbering": 0.10663247108459473, + "OptimizeAliasedCopyChain": 0.01511383056640625, + "OptimizeNKIKernels": 0.4606451988220215, + "PAGLayoutOpt": 26.32272720336914, + "PComputeCutting": 0.302201509475708, + "PGLayoutTilingPipeline": 38.88710403442383, + "PGTiling": 4.423768043518066, + "PadElimination": 0.008622884750366211, + "ParAxesAnnotation": 25.272018432617188, + "PartialLoopFusion": 0.2368309497833252, + "PartialSimdFusion": 0.20722246170043945, + "PenguinizeFunctions": 0.00015999999595806003, + "PerfectLoopNest": 0.06273055076599121, + "PruneFunctions": 0.00016700000560376793, + "RecognizeOpIdiom": 0.20455479621887207, + "Recompute": 0.00649714469909668, + "RelaxPredicates": 0.154876708984375, + "Rematerialization": 0.16764259338378906, + "RemoveOptimizationBarriers": 0.00014099999680183828, + "ReshapeWeights": 0.021569013595581055, + "ResolveAccessConflict": 0.24012255668640137, + "ResolveComplicatePredicates": 0.05034017562866211, + "RewriteReplicationMatmul": 0.04589343070983887, + "RewriteWeights": 0.05840659141540527, + "SFKVectorizer": 3.1227571964263916, + "ScatterMotion": 0.0041600000113248825, + "SimpleAllReduceTiling": 0.06594347953796387, + "Simplifier": 0.11366057395935059, + "SimplifyMacroPredicates": 0.18840670585632324, + "SimplifyNeuronTensor": 1.3299446105957031, + "SimplifySlice": 0.03386688232421875, + "SimplifyTensor": 0.21405529975891113, + "SpillPSum": 0.5441117286682129, + "SplitAPUnionSets": 0.3313255310058594, + "SplitAccGrp": 0.03839588165283203, + "StaticProfiler": 0.13296246528625488, + "StaticTransposeLocalTensor": 0.21724367141723633, + "SundaISel": 1.6302134990692139, + "TCTransform": 0.03438615798950195, + "TensorInitialization": 0.13414645195007324, + "TensorOpSimplifier": 0.27712535858154297, + "TensorOpTransform": 0.8646912574768066, + "TensorizerLegalizationPass": 0.000155999994603917, + "TileCCOps": 0.263721227645874, + "TilingProfiler": 0.39296984672546387, + "TransformConvOp": 0.06336498260498047, + "TritiumFusion": 1.0901517868041992, + "ValueNumbering": 0.09328150749206543, + "VectorizeDMA": 0.03394460678100586, + "VectorizeMatMult": 0.0209348201751709, + "VerifySupportedOps": 0.00023200000578071922, + "WeightCoalescing": 0.05484199523925781, + "ZeroSizeTensorElimination": 0.0004336833953857422, + "algsimp": 0.0020280000753700733, + "batchnorm_expander": 0.0007249999907799065, + "boundary-marker-removal": 0.0004140000091865659, + "call-inliner": 0.0002570000069681555, + "canonicalize-boundary-marker": 0.00044800000614486635, + "collective-stream-id-checker": 7.000000186963007e-05, + "comparison-expander": 0.00041700000292621553, + "computation-deduplicator": 0.0004440000047907233, + "conditional-to-select": 8.70000003487803e-05, + "config-lowering": 0.00020700000459328294, + "constant_folding": 0.00016900000628083944, + "cse": 0.00043799998820759356, + "dce": 3.899999865097925e-05, + "dynamic-slice-transpose": 0.00015799999528098851, + "eliminate-redundant-compare": 0.0001539999939268455, + "emit-offloaded-dropout": 0.0002770000137388706, + "flatten-call-graph": 0.000299000006634742, + "fuse-send-recv": 0.0015030000358819962, + "hilo::LegalizeAlias": 0.003281000070273876, + "hilo::NeuronInstCombine": 0.0011020000092685223, + "hilo::NeuronOpFusion": 0.0003429999924264848, + "hilo::ReplaceTokenTypeWithU8Pass": 0.00018600000475998968, + "hilo::ScheduleFusion": 3.5000000934815034e-05, + "hilo::SixtyFourHack": 0.00020599999697878957, + "hilo::VerifyAliasing": 7.000000186963007e-05, + "hlo-mac-count": 0.0006559999892488122, + "hlo-verifier": 0.006031000055372715, + "io-con-pipe-begin": 4.999999873689376e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.0009500000160187483, + "legalize-ccops": 1.700000029813964e-05, + "legalize-compare": 0.00036899998667649925, + "lower-argminmax-custom-call": 0.00013800000306218863, + "map-inline": 0.0006319999811239541, + "metadata-naming": 0.0009749999735504389, + "mlir::detail::OpToOpPassAdaptor": 0.00022499999613501132, + "mlir::hlo::MhloToPyPenguin": 0.025104999542236328, + "mlir::mhlo::LowerComplexExtraPass": 0.002770999912172556, + "mlir::mhlo::LowerComplexPass": 0.001180000021122396, + "native-to-custom-softmax": 0.00041199999395757914, + "native-to-custom-softmax-dx": 0.00042600001324899495, + "operand_upcaster": 0.0007089999853633344, + "post-par-pipe-begin": 9.999999974752427e-07, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.05639899894595146, + "pre-hlo-begin": 4.999999873689376e-06, + "pre-hlo-end": 9.999999974752427e-07, + "replace-minimum-constant": 0.0002209999947808683, + "reshape-mover": 7.400000322377309e-05, + "simplify-concat": 0.0018210000125691295, + "simplify-while-loops": 5.500000042957254e-05, + "transform-variadic-reduce": 0.0006440000142902136, + "tuple-simplifier": 0.00016700000560376793, + "unpack-nested-aws-ntwsr": 0.00035700001171790063, + "unroll-while-loop": 1.1000000085914508e-05 + }, + "hilo": { + "HloMacCount": 3802996736.0, + "Traffic": 8267154432.0 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 262321, + "StaticProfiler::AifUb": 10.559271812438965, + "StaticProfiler::ArithmeticIntensityTensorizer": 11.613152503967285, + "StaticProfiler::AverageDmaLength": 6652.8759765625, + "StaticProfiler::DDRTransferBytes": 7587185496, + "StaticProfiler::InternalTransferBytes": 632323092, + "StaticProfiler::LoadExpanded": 1033407, + "StaticProfiler::StoreExpanded": 3422, + "StaticProfiler::TotalDMAExpanded": 1036829, + "StaticProfiler::TotalDynamicInstancesCount": 275548, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 274994, + "StaticProfiler::TotalLNCComm": 0, + "StaticProfiler::TotalLNCCommTransfer": 0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0, + "TilingProfiler::DmaInstructionsAfterTiling": 0, + "TilingProfiler::GenericInstructionsAfterTiling": 79, + "TilingProfiler::MatMultInstructionsAfterTiling": 231408, + "TilingProfiler::NumPfTransposes": 398, + "TilingProfiler::NumPfTransposesForIo": 37, + "TilingProfiler::NumPfTransposesForLocal": 216, + "TilingProfiler::NumPfTransposesForNonlocal": 145, + "TilingProfiler::PfTransposeInstructions": 19513, + "TilingProfiler::PfTransposeInstructionsForIo": 19152, + "TilingProfiler::PfTransposeInstructionsForLocal": 216, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 145, + "TilingProfiler::ReduceInstructionsAfterTiling": 74, + "TilingProfiler::SimdInstructionsAfterTiling": 2999, + "TilingProfiler::TotalInstructionsAfterTiling": 0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0, + "TransformConvOp::conv2d_column_packing": 0, + "TransformConvOp::conv2d_column_packing_1": 0, + "TransformConvOp::conv2d_column_packing_io10": 0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0 + } + }, + "all": { + "compiletime": { + "CanonicalizeConv": 9.999999974752427e-07, + "CanonicalizeForTensorizer": 0.0003640000068116933, + "Canonicalizer": 0.007044999860227108, + "HoistCompute": 4.8000001697801054e-05, + "IdentifyCrossPassTensors": 0.00013600000238511711, + "MemcastMotion": 0.00013000000035390258, + "PenguinizeFunctions": 0.00015999999595806003, + "PruneFunctions": 0.00016700000560376793, + "RemoveOptimizationBarriers": 0.00014099999680183828, + "ScatterMotion": 0.0041600000113248825, + "TensorizerLegalizationPass": 0.000155999994603917, + "VerifySupportedOps": 0.00023200000578071922, + "algsimp": 0.0020280000753700733, + "batchnorm_expander": 0.0007249999907799065, + "boundary-marker-removal": 0.0004140000091865659, + "call-inliner": 0.0002570000069681555, + "canonicalize-boundary-marker": 0.00044800000614486635, + "collective-stream-id-checker": 7.000000186963007e-05, + "comparison-expander": 0.00041700000292621553, + "computation-deduplicator": 0.0004440000047907233, + "conditional-to-select": 8.70000003487803e-05, + "config-lowering": 0.00020700000459328294, + "constant_folding": 0.00016900000628083944, + "cse": 0.00043799998820759356, + "dce": 3.899999865097925e-05, + "dynamic-slice-transpose": 0.00015799999528098851, + "eliminate-redundant-compare": 0.0001539999939268455, + "emit-offloaded-dropout": 0.0002770000137388706, + "flatten-call-graph": 0.000299000006634742, + "fuse-send-recv": 0.0015030000358819962, + "hilo::LegalizeAlias": 0.003281000070273876, + "hilo::NeuronInstCombine": 0.0011020000092685223, + "hilo::NeuronOpFusion": 0.0003429999924264848, + "hilo::ReplaceTokenTypeWithU8Pass": 0.00018600000475998968, + "hilo::ScheduleFusion": 3.5000000934815034e-05, + "hilo::SixtyFourHack": 0.00020599999697878957, + "hilo::VerifyAliasing": 7.000000186963007e-05, + "hlo-mac-count": 0.0006559999892488122, + "hlo-verifier": 0.006031000055372715, + "io-con-pipe-begin": 4.999999873689376e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.0009500000160187483, + "legalize-ccops": 1.700000029813964e-05, + "legalize-compare": 0.00036899998667649925, + "lower-argminmax-custom-call": 0.00013800000306218863, + "map-inline": 0.0006319999811239541, + "metadata-naming": 0.0009749999735504389, + "mlir::detail::OpToOpPassAdaptor": 0.00022499999613501132, + "mlir::hlo::MhloToPyPenguin": 0.025104999542236328, + "mlir::mhlo::LowerComplexExtraPass": 0.002770999912172556, + "mlir::mhlo::LowerComplexPass": 0.001180000021122396, + "native-to-custom-softmax": 0.00041199999395757914, + "native-to-custom-softmax-dx": 0.00042600001324899495, + "operand_upcaster": 0.0007089999853633344, + "post-par-pipe-begin": 9.999999974752427e-07, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.05639899894595146, + "pre-hlo-begin": 4.999999873689376e-06, + "pre-hlo-end": 9.999999974752427e-07, + "replace-minimum-constant": 0.0002209999947808683, + "reshape-mover": 7.400000322377309e-05, + "simplify-concat": 0.0018210000125691295, + "simplify-while-loops": 5.500000042957254e-05, + "transform-variadic-reduce": 0.0006440000142902136, + "tuple-simplifier": 0.00016700000560376793, + "unpack-nested-aws-ntwsr": 0.00035700001171790063, + "unroll-while-loop": 1.1000000085914508e-05 + } + }, + "cumsum": { + "compiletime": { + "CoalesceCCOp": 0.0008378028869628906, + "DMALocalityOpt": 0.0003306865692138672, + "DMAProfiler": 0.0007596015930175781, + "DataStreaming": 0.0002918243408203125, + "DoNothing": 0.00012636184692382813, + "ExpandISAMacro": 0.0005497932434082031, + "FactorizeBlkDims": 0.0004723072052001953, + "InferPSumTensor": 0.000583648681640625, + "LateLegalizeInst": 0.00040459632873535156, + "LateNeuronInstComb": 0.0004837512969970703, + "LegalizeSundaAccess": 0.0015611648559570313, + "LegalizeType": 0.00025010108947753906, + "LowerBroadcast": 0.0009808540344238281, + "LowerIntrinsics": 0.0002262592315673828, + "LowerTranspose": 0.00021767616271972656, + "NeuronInstComb": 0.0004963874816894531, + "NeuronLICM": 0.0006859302520751953, + "NeuronSimplifyPredicates": 0.002815723419189453, + "NeuronValueNumbering": 0.0004124641418457031, + "SFKVectorizer": 0.0027742385864257813, + "SimpleAllReduceTiling": 0.000209808349609375, + "SimplifyNeuronTensor": 0.00040721893310546875, + "SpillPSum": 0.0009286403656005859, + "WeightCoalescing": 0.0002105236053466797 + } + }, + "sg00": { + "hilo": { + "ArithmeticIntensity": 0.9200255870819092, + "HloMacCount": 3802996736.0, + "Traffic": 8267154432.0 + } + }, + "sg0000": { + "compiletime": { + "AGOrderingAnalysisPass": 1.4457588195800781, + "AffinePredicateResolution": 0.05167531967163086, + "AliasDependencyElimination": 0.0026276111602783203, + "AliasDependencyInduction": 0.44934630393981934, + "AliasDependencyReset": 1.2677826881408691, + "BFComputeCutting": 0.06423807144165039, + "BirCodeGenLoop": 2.421293258666992, + "CCOpFusion": 0.41050028800964355, + "CanonicalizeDAGForPGTiling": 0.21233797073364258, + "CanonicalizeIR": 0.06626629829406738, + "CoalesceCCOp": 0.1906270980834961, + "CommuteConcat": 0.03319668769836426, + "DMALocalityOpt": 0.03487658500671387, + "DMAProfiler": 0.08790731430053711, + "DMATilingProfiler": 0.07109546661376953, + "DataLocalityOpt": 1.910703182220459, + "DataStreaming": 0.15360140800476074, + "DeConcat": 0.012087583541870117, + "DeadCodeElimination": 0.035611867904663086, + "DeadStoreElimination": 0.37193870544433594, + "DelinearIndices": 0.2894127368927002, + "Delinearization": 0.1295926570892334, + "DoNothing": 6.914138793945313e-05, + "DramToDramTranspose": 1.0679569244384766, + "DumpGraphAndMetadata": 0.24142217636108398, + "EliminateDivs": 0.17337489128112793, + "ExpandBatchNorm": 0.06027984619140625, + "ExpandISAMacro": 0.09040713310241699, + "FactorizeBlkDims": 0.24898743629455566, + "FactorizeThreadAxesInFreeDims": 0.03613853454589844, + "FlattenMacroLoop": 0.26774168014526367, + "GenericAccessSimplifier": 0.03175926208496094, + "InferInitValue": 1.029360294342041, + "InferIntrinsicOnCC": 0.34307408332824707, + "InferNeuronTensor": 1.7935998439788818, + "InferNonlocalTensors": 3.6307339668273926, + "InferPSumTensor": 0.977715015411377, + "InlineNativeKernels": 0.05374264717102051, + "InsertIOTransposes": 1.162278652191162, + "InsertLocalTransposes": 1.0349645614624023, + "InsertOffloadedTransposes": 0.0943443775177002, + "LICM": 0.1061861515045166, + "LateLegalizeInst": 0.22713756561279297, + "LateLegalizePostSplit": 0.09247255325317383, + "LateLowerReshapeOp": 0.04053616523742676, + "LateLowerTensorOp": 0.3356895446777344, + "LateNeuronInstComb": 0.45120882987976074, + "LayoutPreprocessing": 0.9441671371459961, + "LayoutPreprocessingAndAnalysis": 1.2680203914642334, + "LayoutRequirementAnalysis": 0.309098482131958, + "LegalizeCCOpLayout": 0.07318258285522461, + "LegalizeOpLevelAlias": 0.03343796730041504, + "LegalizePartitionReduce": 0.034781694412231445, + "LegalizeSundaAccess": 1.4543089866638184, + "LegalizeSundaMacro": 0.37755250930786133, + "LegalizeType": 0.20833444595336914, + "LocalLayoutOpt": 0.36218762397766113, + "LoopFusion": 0.31240200996398926, + "LoopSplitting": 0.013066768646240234, + "LowerBroadcast": 0.04690980911254883, + "LowerCCOpBlockAxis": 0.23094987869262695, + "LowerComplexBroadcast": 0.15572404861450195, + "LowerIntrinsics": 1.2286322116851807, + "LowerTensorOp": 0.4897449016571045, + "LowerTranspose": 0.39931535720825195, + "MacroGeneration": 2.335334062576294, + "MaskPropagation": 0.14433836936950684, + "MemcpyElimination": 3.9867260456085205, + "MutateDataType": 0.04344511032104492, + "NeuronAliasDependencyInduction": 0.025929927825927734, + "NeuronAliasDependencyReset": 0.04254412651062012, + "NeuronInstComb": 0.19301342964172363, + "NeuronLICM": 0.2890663146972656, + "NeuronLoopFusion": 0.4089043140411377, + "NeuronLoopInterchange": 0.04476189613342285, + "NeuronSimplifier": 0.30055856704711914, + "NeuronSimplifyPredicates": 0.1793985366821289, + "NeuronValueNumbering": 0.10622000694274902, + "OptimizeAliasedCopyChain": 0.01511383056640625, + "OptimizeNKIKernels": 0.4606451988220215, + "PAGLayoutOpt": 26.32272720336914, + "PComputeCutting": 0.302201509475708, + "PGLayoutTilingPipeline": 38.88710403442383, + "PGTiling": 4.423768043518066, + "PadElimination": 0.008622884750366211, + "ParAxesAnnotation": 25.272018432617188, + "PartialLoopFusion": 0.2368309497833252, + "PartialSimdFusion": 0.20722246170043945, + "PerfectLoopNest": 0.06273055076599121, + "RecognizeOpIdiom": 0.20455479621887207, + "Recompute": 0.00649714469909668, + "RelaxPredicates": 0.154876708984375, + "Rematerialization": 0.16764259338378906, + "ReshapeWeights": 0.021569013595581055, + "ResolveAccessConflict": 0.24012255668640137, + "ResolveComplicatePredicates": 0.05034017562866211, + "RewriteReplicationMatmul": 0.04589343070983887, + "RewriteWeights": 0.05840659141540527, + "SFKVectorizer": 3.119982957839966, + "SimpleAllReduceTiling": 0.06573367118835449, + "Simplifier": 0.11366057395935059, + "SimplifyMacroPredicates": 0.18840670585632324, + "SimplifyNeuronTensor": 1.3295373916625977, + "SimplifySlice": 0.03386688232421875, + "SimplifyTensor": 0.21405529975891113, + "SpillPSum": 0.5431830883026123, + "SplitAPUnionSets": 0.3313255310058594, + "SplitAccGrp": 0.03839588165283203, + "StaticProfiler": 0.13296246528625488, + "StaticTransposeLocalTensor": 0.21724367141723633, + "SundaISel": 1.6302134990692139, + "TCTransform": 0.03438615798950195, + "TensorInitialization": 0.13414645195007324, + "TensorOpSimplifier": 0.27712535858154297, + "TensorOpTransform": 0.8646912574768066, + "TileCCOps": 0.263721227645874, + "TilingProfiler": 0.39296984672546387, + "TransformConvOp": 0.06336498260498047, + "TritiumFusion": 1.0901517868041992, + "ValueNumbering": 0.09328150749206543, + "VectorizeDMA": 0.03394460678100586, + "VectorizeMatMult": 0.0209348201751709, + "WeightCoalescing": 0.05463147163391113, + "ZeroSizeTensorElimination": 0.0004336833953857422 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 262321, + "StaticProfiler::AifUb": 10.559271812438965, + "StaticProfiler::ArithmeticIntensityTensorizer": 11.613152503967285, + "StaticProfiler::AverageDmaLength": 6652.8759765625, + "StaticProfiler::AverageFractalPeUtilization": 99.8321762084961, + "StaticProfiler::AveragePartitionUtilization": 99.3888168334961, + "StaticProfiler::AveragePeUtilization": 99.65400695800781, + "StaticProfiler::DDRTransferBytes": 7587185496, + "StaticProfiler::InternalTransferBytes": 632323092, + "StaticProfiler::LoadExpanded": 1033407, + "StaticProfiler::LocalizationEfficiency": 109.9806137084961, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 110.06793212890625, + "StaticProfiler::StoreExpanded": 3422, + "StaticProfiler::TotalDMAExpanded": 1036829, + "StaticProfiler::TotalDynamicInstancesCount": 275548, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 274994, + "StaticProfiler::TotalLNCComm": 0, + "StaticProfiler::TotalLNCCommTransfer": 0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0, + "TilingProfiler::DmaInstructionsAfterTiling": 0, + "TilingProfiler::GenericInstructionsAfterTiling": 79, + "TilingProfiler::MatMultInstructionsAfterTiling": 231408, + "TilingProfiler::NumPfTransposes": 398, + "TilingProfiler::NumPfTransposesForIo": 37, + "TilingProfiler::NumPfTransposesForLocal": 216, + "TilingProfiler::NumPfTransposesForNonlocal": 145, + "TilingProfiler::PfTransposeInstructions": 19513, + "TilingProfiler::PfTransposeInstructionsForIo": 19152, + "TilingProfiler::PfTransposeInstructionsForLocal": 216, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 145, + "TilingProfiler::ReduceInstructionsAfterTiling": 74, + "TilingProfiler::SimdInstructionsAfterTiling": 2999, + "TilingProfiler::TotalInstructionsAfterTiling": 0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0, + "TransformConvOp::conv2d_column_packing": 0, + "TransformConvOp::conv2d_column_packing_1": 0, + "TransformConvOp::conv2d_column_packing_io10": 0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0 + } + } +} \ No newline at end of file diff --git a/token_generation_model/_tp0_bk0/graph.neff b/token_generation_model/_tp0_bk0/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..972c91a0d0d805310ba7367f6a8614353443f945 --- /dev/null +++ b/token_generation_model/_tp0_bk0/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82be447a0a308a6e83990d1f3d193b4dc43ab835b136e7c27647ecf6cde94383 +size 6001664 diff --git a/token_generation_model/_tp0_bk0/log-neuron-cc.txt b/token_generation_model/_tp0_bk0/log-neuron-cc.txt new file mode 100644 index 0000000000000000000000000000000000000000..ae413cac93a73eda2002e750e78bb72b3aa6c73d --- /dev/null +++ b/token_generation_model/_tp0_bk0/log-neuron-cc.txt @@ -0,0 +1,2573 @@ +2025-08-07T13:50:22Z INFO 46994 [root]: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/bin/neuronx-cc compile --framework=XLA /home/ubuntu/qwen3/token_generation_model/_tp0_bk0/model.MODULE_6ef5ba8b41fbbe77f080+74ae8282.hlo_module.pb --output /home/ubuntu/qwen3/token_generation_model/_tp0_bk0/model.MODULE_6ef5ba8b41fbbe77f080+74ae8282.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma' --lnc=1 -O2 --internal-hlo2tensorizer-options=--verify-hlo=true --logfile=/home/ubuntu/qwen3/token_generation_model/_tp0_bk0/log-neuron-cc.txt --enable-internal-neff-wrapper --verbose=35 +2025-08-07T13:50:22Z INFO 46994 [root]: NeuronX Compiler version 2.20.9961.0+0acef03a Python version 3.10.12 HWM version 2.20.0.9961+0acef03a NumPy version 1.26.4 Running on AMI ami-040348201d80b58ad Running in region usw2-az4 +2025-08-07T13:50:22Z INFO 47058 [root]: XLA detected +2025-08-07T13:50:22Z INFO 47058 [root]: Pipeline: HLOToTensorizer Frontend StaticIOTranspose WalrusDriver BIRLinker Kelper NeffWrapper +2025-08-07T13:50:23Z INFO 47058 [root]: Intermediate files stored in /home/ubuntu/qwen3/token_generation_model/_tp0_bk0/neuronxcc-ykq_7n9z, output in /home/ubuntu/qwen3/token_generation_model/_tp0_bk0 +2025-08-07T13:50:23Z INFO 47058 [pipeline.Pipeline.0]: Job Pipeline len(in_states) 1 +2025-08-07T13:50:23Z INFO 47058 [pipeline.Pipeline.0]: Processing input #0 +2025-08-07T13:50:23Z INFO 47058 [pipeline.Pipeline.0]: Running pipeline Pipeline.0 +2025-08-07T13:50:23Z INFO 47058 [pipeline.Pipeline.0]: Starting job job.HLOToTensorizer.0 +2025-08-07T13:50:23Z INFO 47058 [job.HLOToTensorizer.0]: Job HLOToTensorizer len(in_states) 1 +2025-08-07T13:50:23Z INFO 47058 [job.HLOToTensorizer.0]: Processing input #0 +2025-08-07T13:50:23Z INFO 47058 [job.HLOToTensorizer.0]: IR signature: 1b84de1c7109b93d3bf677f50a6adfce9d88aab86c7f512a7234c08cd856732f for model.MODULE_6ef5ba8b41fbbe77f080+74ae8282.hlo_module.pb +2025-08-07T13:50:23Z INFO 47058 [job.HLOToTensorizer.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo2penguin --input /home/ubuntu/qwen3/token_generation_model/_tp0_bk0/model.MODULE_6ef5ba8b41fbbe77f080+74ae8282.hlo_module.pb --out-dir ./ --output penguin.py --remat --max-costly-ops=2 --max-live-in-size=5 --max-remat-chain-size=10 --max-mem-multiple=1.8 --min-def-use-distance=500 --remat-policy=transformer --allow-same-pass-remat=true --layers-per-module=1 --emit-tensor-level-dropout-ops --verify-hlo=true --native-to-custom-softmax --partitioner-opts='--transformer' +2025-08-07T13:50:23Z INFO 47058 [job.HLOToTensorizer.0]: DEBUG: needsModular? No. macCnt 3803070528 num non-trivial Ops 3790 +INFO: Switching to single-module compile. PrePartitionPipe skipped. +INFO: Found memory bound graph +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 2 +Replaced 0 dropout sequences with OffloadedDropout +INFO: HloMacCount has found 3802996736 +INFO: Traffic has found 8267154541 +INFO: AIF 0.920026 +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert cosine custom-call divide dot exponential gather get-tuple-element iota maximum multiply negate pad parameter reduce reshape rng scatter select sine slice subtract transpose tuple +Warning: Could not open file debug_info_hlo_partitions.json +2025-08-07 13:50:23.216940: W hilo/hlo2penguin/utils/DumpDebugInfo.cc:52] Truncating long HLO operator name %tuple.13243 = tuple(%reshape.4825, %scatter.12247, %scatter.12262, %scatter.12275, %scatter.12290, %scatter.12303, %scatter.12318, %scatter.12331, %scatter.12346, %scatter.12359, %scatter.12374, %scatter.12387, %scatter.12402, %scatter.12415, %scatter.12430, %scatter.12443, %scatter.12458, %scatter.12471, %scatter.12486, %scatter.12499, %scatter.12514, %scatter.12527, %scatter.12542, %scatter.12555, %scatter.12570, %scatter.12583, %scatter.12598, %scatter.12611, %scatter.12626, %scatter.12639, %scatter.... to 512 characters in the compiler's debug metadata +Transposable weight idxs: 76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,442,443,444,445,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,462,463,464,465,466,467,468,469,470,471,472,473,474 +Invoking RemoveOptimizationBarriers pass + +2025-08-07T13:50:23Z INFO 47058 [job.HLOToTensorizer.0]: IR signature: 7238a055a0d23cd25fb386dbd049f9c2ed4801d02b78ba29302b97418a4b62c9 for sg0000/HLOToTensorizer +2025-08-07T13:50:25Z INFO 47058 [job.HLOToTensorizer.0]: Job #0 finished +2025-08-07T13:50:26Z INFO 47058 [pipeline.Pipeline.0]: Finished job job.HLOToTensorizer.0 +2025-08-07T13:50:26Z INFO 47058 [pipeline.Pipeline.0]: Starting job job.Frontend.0 +2025-08-07T13:50:26Z INFO 47058 [job.Frontend.0]: Job Frontend len(in_states) 1 +2025-08-07T13:50:26Z INFO 47058 [job.Frontend.0]: Processing input #0 +2025-08-07T13:50:26Z INFO 47058 [job.Frontend.0]: Start model loading +2025-08-07T13:50:26Z INFO 47058 [job.Frontend.0]: Start tensorization +2025-08-07T13:50:27Z INFO 47058 [job.Frontend.0]: Num jobs: 1 +2025-08-07T13:50:27Z USER 47058 [root/Tensorizer/Tensorizer]: Running Tensorizer +2025-08-07T13:50:27Z INFO 47058 [Tensorizer]: Frontend did not find netlist info. Switching to flat flow. +2025-08-07T13:50:27Z INFO 47058 [Tensorizer]: Building model from Penguin script "penguin.py"... +2025-08-07T13:50:27Z INFO 47058 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=1 --num-neuroncores-per-sengine=1 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --keep-remat-dma-transpose --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-08-07T13:50:28Z INFO 47058 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:50:28Z INFO 47058 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:50:28Z INFO 47058 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:50:28Z INFO 47058 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-08-07T13:50:28Z INFO 47058 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-08-07T13:50:29Z INFO 47058 [sg0000/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.033 seconds +2025-08-07T13:50:29Z INFO 47058 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-08-07T13:50:29Z INFO 47058 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-08-07T13:50:30Z INFO 47058 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.015 seconds +2025-08-07T13:50:30Z INFO 47058 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:50:30Z INFO 47058 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-08-07T13:50:31Z INFO 47058 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.106 seconds +2025-08-07T13:50:31Z INFO 47058 [sg0000/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-08-07T13:50:31Z INFO 47058 [sg0000/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-08-07T13:50:32Z INFO 47058 [sg0000/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.063 seconds +2025-08-07T13:50:32Z INFO 47058 [sg0000/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-08-07T13:50:32Z INFO 47058 [sg0000/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-08-07T13:50:33Z INFO 47058 [sg0000/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.490 seconds +2025-08-07T13:50:33Z INFO 47058 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:50:33Z INFO 47058 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:50:33Z INFO 47058 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-08-07T13:50:34Z INFO 47058 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.004 seconds +2025-08-07T13:50:34Z INFO 47058 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:50:34Z INFO 47058 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-08-07T13:50:35Z INFO 47058 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.340 seconds +2025-08-07T13:50:36Z INFO 47058 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 2.049 seconds +2025-08-07T13:50:36Z INFO 47058 [sg0000/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-08-07T13:50:36Z INFO 47058 [sg0000/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-08-07T13:50:37Z INFO 47058 [sg0000/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.277 seconds +2025-08-07T13:50:37Z INFO 47058 [sg0000/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-08-07T13:50:37Z INFO 47058 [sg0000/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-08-07T13:50:38Z INFO 47058 [sg0000/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.066 seconds +2025-08-07T13:50:38Z INFO 47058 [sg0000/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-08-07T13:50:38Z INFO 47058 [sg0000/Tensorizer/LegalizeCCOpLayout]: Finished (changed=True) +2025-08-07T13:50:39Z INFO 47058 [sg0000/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.073 seconds +2025-08-07T13:50:39Z INFO 47058 [sg0000/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-08-07T13:50:39Z INFO 47058 [sg0000/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-08-07T13:50:40Z INFO 47058 [sg0000/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.050 seconds +2025-08-07T13:50:40Z INFO 47058 [sg0000/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-08-07T13:50:40Z INFO 47058 [sg0000/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-08-07T13:50:41Z INFO 47058 [sg0000/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.052 seconds +2025-08-07T13:50:41Z INFO 47058 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:50:41Z INFO 47058 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:50:42Z INFO 47058 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.190 seconds +2025-08-07T13:50:42Z INFO 47058 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:50:42Z INFO 47058 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:50:43Z INFO 47058 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.050 seconds +2025-08-07T13:50:43Z INFO 47058 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:50:43Z INFO 47058 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:50:44Z INFO 47058 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.730 seconds +2025-08-07T13:50:44Z INFO 47058 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:50:44Z INFO 47058 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:50:45Z INFO 47058 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.049 seconds +2025-08-07T13:50:45Z INFO 47058 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:50:45Z INFO 47058 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:50:46Z INFO 47058 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.051 seconds +2025-08-07T13:50:46Z INFO 47058 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:50:46Z INFO 47058 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:50:47Z INFO 47058 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.052 seconds +2025-08-07T13:50:47Z INFO 47058 [sg0000/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-08-07T13:50:47Z INFO 47058 [sg0000/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-08-07T13:50:48Z INFO 47058 [sg0000/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.060 seconds +2025-08-07T13:50:48Z INFO 47058 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:50:48Z INFO 47058 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:50:49Z INFO 47058 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.052 seconds +2025-08-07T13:50:49Z INFO 47058 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:50:49Z INFO 47058 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:50:50Z INFO 47058 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.173 seconds +2025-08-07T13:50:50Z INFO 47058 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:50:50Z INFO 47058 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:50:51Z INFO 47058 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.050 seconds +2025-08-07T13:50:51Z INFO 47058 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-08-07T13:50:52Z INFO 47058 [sg0000/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-08-07T13:50:52Z INFO 47058 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.865 seconds +2025-08-07T13:50:52Z INFO 47058 [sg0000/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-08-07T13:50:53Z INFO 47058 [sg0000/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-08-07T13:50:54Z INFO 47058 [sg0000/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.336 seconds +2025-08-07T13:50:54Z INFO 47058 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:50:54Z INFO 47058 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:50:54Z INFO 47058 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-08-07T13:50:55Z INFO 47058 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.005 seconds +2025-08-07T13:50:55Z INFO 47058 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:50:55Z INFO 47058 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-08-07T13:50:55Z INFO 47058 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.449 seconds +2025-08-07T13:50:55Z INFO 47058 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 1.268 seconds +2025-08-07T13:50:55Z INFO 47058 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-08-07T13:50:59Z INFO 47058 [sg0000/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-08-07T13:50:59Z INFO 47058 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 3.987 seconds +2025-08-07T13:50:59Z INFO 47058 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:51:01Z INFO 47058 [sg0000/Tensorizer/LoopFusion]: Finished (changed=True) +2025-08-07T13:51:01Z INFO 47058 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 1.929 seconds +2025-08-07T13:51:01Z INFO 47058 [sg0000/Tensorizer/Rematerialization]: Running Rematerialization +2025-08-07T13:51:01Z INFO 47058 [sg0000/Tensorizer/Rematerialization]: Finished (changed=True) +2025-08-07T13:51:01Z INFO 47058 [sg0000/Tensorizer/Rematerialization]: Rematerialization finished after 0.168 seconds +2025-08-07T13:51:01Z INFO 47058 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:51:01Z INFO 47058 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:51:01Z INFO 47058 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.364 seconds +2025-08-07T13:51:01Z INFO 47058 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:51:02Z INFO 47058 [sg0000/Tensorizer/Delinearization]: Finished (changed=True) +2025-08-07T13:51:02Z INFO 47058 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.563 seconds +2025-08-07T13:51:02Z INFO 47058 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:51:03Z INFO 47058 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=True) +2025-08-07T13:51:03Z INFO 47058 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 1.291 seconds +2025-08-07T13:51:03Z INFO 47058 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:51:03Z INFO 47058 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:51:03Z INFO 47058 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.118 seconds +2025-08-07T13:51:03Z INFO 47058 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:51:04Z INFO 47058 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:51:04Z INFO 47058 [sg0000/Tensorizer/LICM]: LICM finished after 0.081 seconds +2025-08-07T13:51:04Z INFO 47058 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:51:04Z INFO 47058 [sg0000/Tensorizer/Delinearization]: Finished (changed=True) +2025-08-07T13:51:04Z INFO 47058 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.130 seconds +2025-08-07T13:51:04Z INFO 47058 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:51:04Z INFO 47058 [sg0000/Tensorizer/LoopFusion]: Finished (changed=True) +2025-08-07T13:51:04Z INFO 47058 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.578 seconds +2025-08-07T13:51:04Z INFO 47058 [sg0000/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-08-07T13:51:04Z INFO 47058 [sg0000/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-08-07T13:51:04Z INFO 47058 [sg0000/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.034 seconds +2025-08-07T13:51:04Z INFO 47058 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:51:04Z INFO 47058 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:51:04Z INFO 47058 [sg0000/Tensorizer/LICM]: LICM finished after 0.062 seconds +2025-08-07T13:51:04Z INFO 47058 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:51:05Z INFO 47058 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:51:05Z INFO 47058 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.231 seconds +2025-08-07T13:51:05Z INFO 47058 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:51:05Z INFO 47058 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=True) +2025-08-07T13:51:05Z INFO 47058 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.122 seconds +2025-08-07T13:51:05Z INFO 47058 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:51:05Z INFO 47058 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:51:05Z INFO 47058 [sg0000/Tensorizer/LICM]: LICM finished after 0.059 seconds +2025-08-07T13:51:05Z INFO 47058 [sg0000/Tensorizer/PadElimination]: Running PadElimination +2025-08-07T13:51:05Z INFO 47058 [sg0000/Tensorizer/PadElimination]: Finished (changed=False) +2025-08-07T13:51:05Z INFO 47058 [sg0000/Tensorizer/PadElimination]: PadElimination finished after 0.009 seconds +2025-08-07T13:51:05Z INFO 47058 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:51:05Z INFO 47058 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:51:05Z INFO 47058 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.129 seconds +2025-08-07T13:51:05Z INFO 47058 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:51:05Z INFO 47058 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-08-07T13:51:05Z INFO 47058 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.312 seconds +2025-08-07T13:51:05Z INFO 47058 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:51:05Z INFO 47058 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:51:05Z INFO 47058 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.031 seconds +2025-08-07T13:51:05Z INFO 47058 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:51:05Z INFO 47058 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:51:05Z INFO 47058 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.112 seconds +2025-08-07T13:51:05Z INFO 47058 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:51:06Z INFO 47058 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:51:06Z INFO 47058 [sg0000/Tensorizer/LICM]: LICM finished after 0.060 seconds +2025-08-07T13:51:06Z INFO 47058 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:51:06Z INFO 47058 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-08-07T13:51:06Z INFO 47058 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.093 seconds +2025-08-07T13:51:06Z INFO 47058 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:51:06Z INFO 47058 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:51:06Z INFO 47058 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.034 seconds +2025-08-07T13:51:06Z INFO 47058 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:51:06Z INFO 47058 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:51:06Z INFO 47058 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.033 seconds +2025-08-07T13:51:06Z INFO 47058 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-08-07T13:51:06Z INFO 47058 [sg0000/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-08-07T13:51:06Z INFO 47058 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.205 seconds +2025-08-07T13:51:06Z INFO 47058 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:51:06Z INFO 47058 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-08-07T13:51:06Z INFO 47058 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.124 seconds +2025-08-07T13:51:06Z INFO 47058 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:51:06Z INFO 47058 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-08-07T13:51:06Z INFO 47058 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.372 seconds +2025-08-07T13:51:06Z INFO 47058 [sg0000/Tensorizer/Recompute]: Running Recompute +2025-08-07T13:51:06Z INFO 47058 [sg0000/Tensorizer/Recompute]: Finished (changed=False) +2025-08-07T13:51:06Z INFO 47058 [sg0000/Tensorizer/Recompute]: Recompute finished after 0.006 seconds +2025-08-07T13:51:06Z INFO 47058 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:51:06Z INFO 47058 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:51:06Z INFO 47058 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.035 seconds +2025-08-07T13:51:06Z INFO 47058 [Tensorizer]: After optimization: 1185 statements +2025-08-07T13:51:06Z INFO 47058 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:51:06Z INFO 47058 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:51:06Z INFO 47058 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:51:06Z INFO 47058 [sg0000/Tensorizer/MutateDataType]: Running MutateDataType +2025-08-07T13:51:07Z INFO 47058 [sg0000/Tensorizer/MutateDataType]: Finished (changed=False) +2025-08-07T13:51:07Z INFO 47058 [sg0000/Tensorizer/MutateDataType]: MutateDataType finished after 0.043 seconds +2025-08-07T13:51:07Z INFO 47058 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:51:07Z INFO 47058 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:51:07Z INFO 47058 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.032 seconds +2025-08-07T13:51:07Z INFO 47058 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:51:07Z INFO 47058 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:51:07Z INFO 47058 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.114 seconds +2025-08-07T13:51:07Z INFO 47058 [sg0000/Tensorizer/TileCCOps]: Running TileCCOps +2025-08-07T13:51:07Z INFO 47058 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=8192 is not above min_allgather_tile_size_in_bytes=8388608` +2025-08-07T13:51:07Z INFO 47058 [sg0000/Tensorizer/TileCCOps]: in bfloat16 (4096,) %'all_gather.1' = AllGatherOp-502 AllGather_add(bfloat16 (2048,) %'gather.1', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((4096,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.47 | hlo_id: 47 | , id = 502 +2025-08-07T13:51:07Z INFO 47058 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `All gather output tensor check failed` +2025-08-07T13:51:07Z INFO 47058 [sg0000/Tensorizer/TileCCOps]: in float32 (512,) %'all_gather.2' = AllGatherOp-9601 AllGather_add(float32 (256,) %'add.217', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.12078 | hlo_id: 12078 | , id = 9601 +2025-08-07T13:51:07Z INFO 47058 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=2048 is not above min_allgather_tile_size_in_bytes=8388608` +2025-08-07T13:51:07Z INFO 47058 [sg0000/Tensorizer/TileCCOps]: in uint32 (512,) %'all_gather.3' = AllGatherOp-9617 AllGather_add(uint32 (256,) %'add.218', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.12213 | hlo_id: 12213 | , id = 9617 +2025-08-07T13:51:07Z INFO 47058 [sg0000/Tensorizer/TileCCOps]: Finished (changed=False) +2025-08-07T13:51:07Z INFO 47058 [sg0000/Tensorizer/TileCCOps]: TileCCOps finished after 0.264 seconds +2025-08-07T13:51:07Z INFO 47058 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:51:07Z INFO 47058 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-08-07T13:51:07Z INFO 47058 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.347 seconds +2025-08-07T13:51:07Z INFO 47058 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:51:07Z INFO 47058 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:51:07Z INFO 47058 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.123 seconds +2025-08-07T13:51:07Z INFO 47058 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:51:08Z INFO 47058 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:51:08Z INFO 47058 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.274 seconds +2025-08-07T13:51:08Z INFO 47058 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:51:08Z INFO 47058 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:51:08Z INFO 47058 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.036 seconds +2025-08-07T13:51:08Z INFO 47058 [sg0000/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-08-07T13:51:08Z INFO 47058 [sg0000/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-08-07T13:51:08Z INFO 47058 [sg0000/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.041 seconds +2025-08-07T13:51:08Z INFO 47058 [sg0000/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-08-07T13:51:08Z INFO 47058 [sg0000/Tensorizer/InferIntrinsicOnCC]: Finished (changed=True) +2025-08-07T13:51:08Z INFO 47058 [sg0000/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.343 seconds +2025-08-07T13:51:08Z INFO 47058 [sg0000/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-08-07T13:51:08Z INFO 47058 [sg0000/Tensorizer/ResolveAccessConflict]: Finished (changed=True) +2025-08-07T13:51:08Z INFO 47058 [sg0000/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.240 seconds +2025-08-07T13:51:08Z INFO 47058 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:51:08Z INFO 47058 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:51:09Z INFO 47058 [sg0000/Tensorizer/LICM]: LICM finished after 0.068 seconds +2025-08-07T13:51:09Z INFO 47058 [sg0000/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-08-07T13:51:09Z INFO 47058 [sg0000/Tensorizer/LocalLayoutOpt]: Finished (changed=False) +2025-08-07T13:51:09Z INFO 47058 [sg0000/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.362 seconds +2025-08-07T13:51:09Z INFO 47058 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:51:09Z INFO 47058 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:51:09Z INFO 47058 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.289 seconds +2025-08-07T13:51:09Z INFO 47058 [sg0000/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-08-07T13:51:09Z INFO 47058 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:51:09Z INFO 47058 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-08-07T13:51:09Z INFO 47058 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.241 seconds +2025-08-07T13:51:09Z INFO 47058 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-08-07T13:51:09Z INFO 47058 [sg0000/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-08-07T13:51:10Z INFO 47058 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:51:10Z INFO 47058 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:51:10Z INFO 47058 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.130 seconds +2025-08-07T13:51:10Z INFO 47058 [sg0000/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-08-07T13:51:10Z INFO 47058 [sg0000/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.944 seconds +2025-08-07T13:51:10Z INFO 47058 [sg0000/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-08-07T13:51:11Z INFO 47058 [sg0000/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.309 seconds +2025-08-07T13:51:11Z INFO 47058 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 1.268 seconds +2025-08-07T13:51:11Z INFO 47058 [sg0000/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-08-07T13:51:11Z INFO 47058 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:51:12Z INFO 47058 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:51:14Z INFO 47058 [sg0000/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-08-07T13:51:14Z INFO 47058 [sg0000/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 3.631 seconds +2025-08-07T13:51:14Z INFO 47058 [sg0000/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-08-07T13:51:14Z INFO 47058 [sg0000/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-08-07T13:51:15Z INFO 47058 [sg0000/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-08-07T13:51:40Z INFO 47058 [sg0000/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-08-07T13:51:40Z INFO 47058 [sg0000/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 25.272 seconds +2025-08-07T13:51:40Z INFO 47058 [sg0000/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-08-07T13:51:41Z INFO 47058 [sg0000/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-08-07T13:51:41Z INFO 47058 [sg0000/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 1.035 seconds +2025-08-07T13:51:41Z INFO 47058 [sg0000/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 26.323 seconds +2025-08-07T13:51:41Z INFO 47058 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:51:41Z INFO 47058 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-08-07T13:51:41Z INFO 47058 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.144 seconds +2025-08-07T13:51:41Z INFO 47058 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-08-07T13:51:41Z INFO 47058 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-08-07T13:51:41Z INFO 47058 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.212 seconds +2025-08-07T13:51:41Z INFO 47058 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:51:41Z INFO 47058 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-08-07T13:51:41Z INFO 47058 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.231 seconds +2025-08-07T13:51:41Z INFO 47058 [sg0000/Tensorizer/PGTiling]: Running PGTiling +2025-08-07T13:51:41Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 9858 of IO tensor {'CrossPassTensor': ''}bfloat16 %input4|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(1, 'AG2151'), (188, 'AG2146'), (80, 'AG2149')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10135 of IO tensor {'CrossPassTensor': ''}bfloat16 %input6|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(3, 'AG2159'), (188, 'AG2146'), (82, 'AG2157')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10386 of IO tensor {'CrossPassTensor': ''}bfloat16 %input8|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(5, 'AG2166'), (188, 'AG2146'), (84, 'AG2164')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10637 of IO tensor {'CrossPassTensor': ''}bfloat16 %input10|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(7, 'AG2173'), (188, 'AG2146'), (86, 'AG2171')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10888 of IO tensor {'CrossPassTensor': ''}bfloat16 %input12|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(9, 'AG2180'), (188, 'AG2146'), (88, 'AG2178')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11139 of IO tensor {'CrossPassTensor': ''}bfloat16 %input14|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(11, 'AG2187'), (188, 'AG2146'), (90, 'AG2185')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11390 of IO tensor {'CrossPassTensor': ''}bfloat16 %input16|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(13, 'AG2194'), (188, 'AG2146'), (92, 'AG2192')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11641 of IO tensor {'CrossPassTensor': ''}bfloat16 %input18|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(15, 'AG2201'), (188, 'AG2146'), (94, 'AG2199')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11892 of IO tensor {'CrossPassTensor': ''}bfloat16 %input20|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(17, 'AG2208'), (188, 'AG2146'), (96, 'AG2206')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12143 of IO tensor {'CrossPassTensor': ''}bfloat16 %input22|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(19, 'AG2215'), (188, 'AG2146'), (98, 'AG2213')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12394 of IO tensor {'CrossPassTensor': ''}bfloat16 %input24|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(21, 'AG2222'), (188, 'AG2146'), (100, 'AG2220')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12645 of IO tensor {'CrossPassTensor': ''}bfloat16 %input26|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(23, 'AG2229'), (188, 'AG2146'), (102, 'AG2227')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12896 of IO tensor {'CrossPassTensor': ''}bfloat16 %input28|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(25, 'AG2236'), (188, 'AG2146'), (104, 'AG2234')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13147 of IO tensor {'CrossPassTensor': ''}bfloat16 %input30|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(27, 'AG2243'), (188, 'AG2146'), (106, 'AG2241')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13398 of IO tensor {'CrossPassTensor': ''}bfloat16 %input32|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(29, 'AG2250'), (188, 'AG2146'), (108, 'AG2248')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13649 of IO tensor {'CrossPassTensor': ''}bfloat16 %input34|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(31, 'AG2257'), (188, 'AG2146'), (110, 'AG2255')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13900 of IO tensor {'CrossPassTensor': ''}bfloat16 %input36|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(33, 'AG2264'), (188, 'AG2146'), (112, 'AG2262')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14151 of IO tensor {'CrossPassTensor': ''}bfloat16 %input38|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(35, 'AG2271'), (188, 'AG2146'), (114, 'AG2269')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14402 of IO tensor {'CrossPassTensor': ''}bfloat16 %input40|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(37, 'AG2278'), (188, 'AG2146'), (116, 'AG2276')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14653 of IO tensor {'CrossPassTensor': ''}bfloat16 %input42|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(39, 'AG2285'), (188, 'AG2146'), (118, 'AG2283')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14904 of IO tensor {'CrossPassTensor': ''}bfloat16 %input44|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(41, 'AG2292'), (188, 'AG2146'), (120, 'AG2290')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15155 of IO tensor {'CrossPassTensor': ''}bfloat16 %input46|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(43, 'AG2299'), (188, 'AG2146'), (122, 'AG2297')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15406 of IO tensor {'CrossPassTensor': ''}bfloat16 %input48|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(45, 'AG2306'), (188, 'AG2146'), (124, 'AG2304')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15657 of IO tensor {'CrossPassTensor': ''}bfloat16 %input50|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(47, 'AG2313'), (188, 'AG2146'), (126, 'AG2311')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15908 of IO tensor {'CrossPassTensor': ''}bfloat16 %input52|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(49, 'AG2320'), (188, 'AG2146'), (128, 'AG2318')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16159 of IO tensor {'CrossPassTensor': ''}bfloat16 %input54|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(51, 'AG2327'), (188, 'AG2146'), (130, 'AG2325')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16410 of IO tensor {'CrossPassTensor': ''}bfloat16 %input56|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(53, 'AG2334'), (188, 'AG2146'), (132, 'AG2332')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16661 of IO tensor {'CrossPassTensor': ''}bfloat16 %input58|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(55, 'AG2341'), (188, 'AG2146'), (134, 'AG2339')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16912 of IO tensor {'CrossPassTensor': ''}bfloat16 %input60|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(57, 'AG2348'), (188, 'AG2146'), (136, 'AG2346')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 17163 of IO tensor {'CrossPassTensor': ''}bfloat16 %input62|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(59, 'AG2355'), (188, 'AG2146'), (138, 'AG2353')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 17414 of IO tensor {'CrossPassTensor': ''}bfloat16 %input64|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(61, 'AG2362'), (188, 'AG2146'), (140, 'AG2360')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 17665 of IO tensor {'CrossPassTensor': ''}bfloat16 %input66|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(63, 'AG2369'), (188, 'AG2146'), (142, 'AG2367')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 17916 of IO tensor {'CrossPassTensor': ''}bfloat16 %input68|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(65, 'AG2376'), (188, 'AG2146'), (144, 'AG2374')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 18167 of IO tensor {'CrossPassTensor': ''}bfloat16 %input70|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(67, 'AG2383'), (188, 'AG2146'), (146, 'AG2381')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 18418 of IO tensor {'CrossPassTensor': ''}bfloat16 %input72|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(69, 'AG2390'), (188, 'AG2146'), (148, 'AG2388')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 18669 of IO tensor {'CrossPassTensor': ''}bfloat16 %input74|NHWC|(1, 4, 8, 128, 2, 64) is not sorted, index list (w/ AG ids): [(71, 'AG2397'), (188, 'AG2146'), (150, 'AG2395')] +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 1.446 seconds +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.217 seconds +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.302 seconds +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.064 seconds +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.013 seconds +2025-08-07T13:51:43Z INFO 47058 [sg0000/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-08-07T13:51:46Z INFO 47058 [sg0000/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-08-07T13:51:46Z INFO 47058 [sg0000/Tensorizer/MacroGeneration]: MacroGeneration finished after 2.335 seconds +2025-08-07T13:51:46Z INFO 47058 [sg0000/Tensorizer/PGTiling]: PGTiling finished after 4.424 seconds +2025-08-07T13:51:46Z INFO 47058 [sg0000/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-08-07T13:51:47Z INFO 47058 [sg0000/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-08-07T13:51:47Z INFO 47058 [sg0000/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 1.162 seconds +2025-08-07T13:51:47Z INFO 47058 [sg0000/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-08-07T13:51:47Z INFO 47058 [sg0000/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-08-07T13:51:47Z INFO 47058 [sg0000/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.094 seconds +2025-08-07T13:51:47Z INFO 47058 [sg0000/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-08-07T13:51:48Z INFO 47058 [sg0000/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-08-07T13:51:48Z INFO 47058 [sg0000/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 1.068 seconds +2025-08-07T13:51:48Z INFO 47058 [sg0000/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 38.887 seconds +2025-08-07T13:51:48Z INFO 47058 [sg0000/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-08-07T13:51:48Z INFO 47058 [sg0000/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:51:48Z INFO 47058 [sg0000/Tensorizer/TilingBottleneck]: 19008: transpose_128x128 +2025-08-07T13:51:48Z INFO 47058 [sg0000/Tensorizer/TilingBottleneck]: 19008: matmul_128x128x1 +2025-08-07T13:51:48Z INFO 47058 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:48Z INFO 47058 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:48Z INFO 47058 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:48Z INFO 47058 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:48Z INFO 47058 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:48Z INFO 47058 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:48Z INFO 47058 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:48Z INFO 47058 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:48Z INFO 47058 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:48Z INFO 47058 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:48Z INFO 47058 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:48Z INFO 47058 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:48Z INFO 47058 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:48Z INFO 47058 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:48Z INFO 47058 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:48Z INFO 47058 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:48Z INFO 47058 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:48Z INFO 47058 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:48Z INFO 47058 [sg0000/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-08-07T13:51:48Z INFO 47058 [sg0000/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.393 seconds +2025-08-07T13:51:48Z INFO 47058 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:51:49Z INFO 47058 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:51:49Z INFO 47058 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.431 seconds +2025-08-07T13:51:49Z INFO 47058 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-08-07T13:51:51Z INFO 47058 [sg0000/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-08-07T13:51:51Z INFO 47058 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 1.794 seconds +2025-08-07T13:51:51Z INFO 47058 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:51:51Z INFO 47058 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:51:51Z INFO 47058 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.238 seconds +2025-08-07T13:51:51Z INFO 47058 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:51:51Z INFO 47058 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:51:51Z INFO 47058 [sg0000/Tensorizer/LICM]: LICM finished after 0.096 seconds +2025-08-07T13:51:51Z INFO 47058 [sg0000/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-08-07T13:51:51Z INFO 47058 [sg0000/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-08-07T13:51:51Z INFO 47058 [sg0000/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.046 seconds +2025-08-07T13:51:51Z INFO 47058 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:51:51Z INFO 47058 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:51:51Z INFO 47058 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.120 seconds +2025-08-07T13:51:51Z INFO 47058 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:51:51Z INFO 47058 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-08-07T13:51:51Z INFO 47058 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.128 seconds +2025-08-07T13:51:51Z INFO 47058 [sg0000/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-08-07T13:51:53Z INFO 47058 [sg0000/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-08-07T13:51:53Z INFO 47058 [sg0000/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 1.911 seconds +2025-08-07T13:51:53Z INFO 47058 [sg0000/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-08-07T13:51:53Z INFO 47058 [sg0000/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:51:53Z INFO 47058 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 19008: transpose_128x128 +2025-08-07T13:51:53Z INFO 47058 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 19008: matmul_128x128x1 +2025-08-07T13:51:53Z INFO 47058 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:53Z INFO 47058 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:53Z INFO 47058 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:53Z INFO 47058 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:53Z INFO 47058 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:53Z INFO 47058 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:53Z INFO 47058 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:53Z INFO 47058 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:53Z INFO 47058 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:53Z INFO 47058 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:53Z INFO 47058 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:53Z INFO 47058 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:53Z INFO 47058 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:53Z INFO 47058 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:53Z INFO 47058 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:53Z INFO 47058 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:53Z INFO 47058 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:53Z INFO 47058 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:51:53Z INFO 47058 [sg0000/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-08-07T13:51:53Z INFO 47058 [sg0000/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.071 seconds +2025-08-07T13:51:53Z INFO 47058 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.289 seconds +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12262 | hlo_id: 12262 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12247 | hlo_id: 12247 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12290 | hlo_id: 12290 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12275 | hlo_id: 12275 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12318 | hlo_id: 12318 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12303 | hlo_id: 12303 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12346 | hlo_id: 12346 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12331 | hlo_id: 12331 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12374 | hlo_id: 12374 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12359 | hlo_id: 12359 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12402 | hlo_id: 12402 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12387 | hlo_id: 12387 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12430 | hlo_id: 12430 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12415 | hlo_id: 12415 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12458 | hlo_id: 12458 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12443 | hlo_id: 12443 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12486 | hlo_id: 12486 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12471 | hlo_id: 12471 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12514 | hlo_id: 12514 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12499 | hlo_id: 12499 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12542 | hlo_id: 12542 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12527 | hlo_id: 12527 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12570 | hlo_id: 12570 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12555 | hlo_id: 12555 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12598 | hlo_id: 12598 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12583 | hlo_id: 12583 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12626 | hlo_id: 12626 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12611 | hlo_id: 12611 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12654 | hlo_id: 12654 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12639 | hlo_id: 12639 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12682 | hlo_id: 12682 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12667 | hlo_id: 12667 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12710 | hlo_id: 12710 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12695 | hlo_id: 12695 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12738 | hlo_id: 12738 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12723 | hlo_id: 12723 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12766 | hlo_id: 12766 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12751 | hlo_id: 12751 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12794 | hlo_id: 12794 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12779 | hlo_id: 12779 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12822 | hlo_id: 12822 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12807 | hlo_id: 12807 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12850 | hlo_id: 12850 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12835 | hlo_id: 12835 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12878 | hlo_id: 12878 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12863 | hlo_id: 12863 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12906 | hlo_id: 12906 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12891 | hlo_id: 12891 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12934 | hlo_id: 12934 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12919 | hlo_id: 12919 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12962 | hlo_id: 12962 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12947 | hlo_id: 12947 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12990 | hlo_id: 12990 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12975 | hlo_id: 12975 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13018 | hlo_id: 13018 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13003 | hlo_id: 13003 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13046 | hlo_id: 13046 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13031 | hlo_id: 13031 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13074 | hlo_id: 13074 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13059 | hlo_id: 13059 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13102 | hlo_id: 13102 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13087 | hlo_id: 13087 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13130 | hlo_id: 13130 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13115 | hlo_id: 13115 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13158 | hlo_id: 13158 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13143 | hlo_id: 13143 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13186 | hlo_id: 13186 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13171 | hlo_id: 13171 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13214 | hlo_id: 13214 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13199 | hlo_id: 13199 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13242 | hlo_id: 13242 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13227 | hlo_id: 13227 | +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.378 seconds +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.302 seconds +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.063 seconds +2025-08-07T13:51:54Z INFO 47058 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:51:55Z INFO 47058 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:51:55Z INFO 47058 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.179 seconds +2025-08-07T13:51:55Z INFO 47058 [sg0000/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-08-07T13:51:55Z INFO 47058 [sg0000/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-08-07T13:51:55Z INFO 47058 [sg0000/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.058 seconds +2025-08-07T13:51:55Z INFO 47058 [sg0000/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-08-07T13:51:55Z INFO 47058 [sg0000/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-08-07T13:51:55Z INFO 47058 [sg0000/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.022 seconds +2025-08-07T13:51:55Z INFO 47058 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:51:55Z INFO 47058 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-08-07T13:51:55Z INFO 47058 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.268 seconds +2025-08-07T13:51:55Z INFO 47058 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:51:55Z INFO 47058 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-08-07T13:51:55Z INFO 47058 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.188 seconds +2025-08-07T13:51:55Z INFO 47058 [sg0000/Tensorizer/InferInitValue]: Running InferInitValue +2025-08-07T13:51:56Z INFO 47058 [sg0000/Tensorizer/InferInitValue]: Finished (changed=True) +2025-08-07T13:51:56Z INFO 47058 [sg0000/Tensorizer/InferInitValue]: InferInitValue finished after 1.029 seconds +2025-08-07T13:51:56Z INFO 47058 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:51:57Z INFO 47058 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:51:57Z INFO 47058 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.301 seconds +2025-08-07T13:51:57Z INFO 47058 [sg0000/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-08-07T13:51:57Z INFO 47058 [sg0000/Tensorizer/SimplifyTensor]: Finished (changed=True) +2025-08-07T13:51:57Z INFO 47058 [sg0000/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.214 seconds +2025-08-07T13:51:57Z INFO 47058 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:51:57Z INFO 47058 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:51:57Z INFO 47058 [sg0000/Tensorizer/LICM]: LICM finished after 0.106 seconds +2025-08-07T13:51:57Z INFO 47058 [sg0000/Tensorizer/SundaISel]: Running SundaISel +2025-08-07T13:51:58Z INFO 47058 [sg0000/Tensorizer/SundaISel]: Finished (changed=True) +2025-08-07T13:51:58Z INFO 47058 [sg0000/Tensorizer/SundaISel]: SundaISel finished after 1.630 seconds +2025-08-07T13:51:58Z INFO 47058 [sg0000/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-08-07T13:51:58Z INFO 47058 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:51:58Z INFO 47058 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-08-07T13:51:59Z INFO 47058 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.003 seconds +2025-08-07T13:51:59Z INFO 47058 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-08-07T13:51:59Z INFO 47058 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=True) +2025-08-07T13:51:59Z INFO 47058 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.026 seconds +2025-08-07T13:51:59Z INFO 47058 [sg0000/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.043 seconds +2025-08-07T13:51:59Z INFO 47058 [sg0000/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-08-07T13:51:59Z INFO 47058 [sg0000/Tensorizer/LowerComplexBroadcast]: Finished (changed=True) +2025-08-07T13:51:59Z INFO 47058 [sg0000/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.156 seconds +2025-08-07T13:51:59Z INFO 47058 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:51:59Z INFO 47058 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-08-07T13:51:59Z INFO 47058 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.046 seconds +2025-08-07T13:51:59Z INFO 47058 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:51:59Z INFO 47058 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:51:59Z INFO 47058 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.054 seconds +2025-08-07T13:51:59Z INFO 47058 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-08-07T13:51:59Z INFO 47058 [sg0000/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-08-07T13:51:59Z INFO 47058 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.409 seconds +2025-08-07T13:51:59Z INFO 47058 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:51:59Z INFO 47058 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-08-07T13:51:59Z INFO 47058 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.045 seconds +2025-08-07T13:51:59Z INFO 47058 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:52:00Z INFO 47058 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:52:00Z INFO 47058 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.232 seconds +2025-08-07T13:52:00Z INFO 47058 [sg0000/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:52:00Z INFO 47058 [sg0000/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-08-07T13:52:00Z INFO 47058 [sg0000/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.249 seconds +2025-08-07T13:52:00Z INFO 47058 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:52:01Z INFO 47058 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-08-07T13:52:01Z INFO 47058 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 1.416 seconds +2025-08-07T13:52:01Z INFO 47058 [sg0000/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:52:01Z INFO 47058 [sg0000/Tensorizer/NeuronValueNumbering]: Finished (changed=True) +2025-08-07T13:52:01Z INFO 47058 [sg0000/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.106 seconds +2025-08-07T13:52:01Z INFO 47058 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:52:02Z INFO 47058 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:52:02Z INFO 47058 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.193 seconds +2025-08-07T13:52:02Z INFO 47058 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-08-07T13:52:02Z INFO 47058 [sg0000/Tensorizer/VectorizeDMA]: Finished (changed=False) +2025-08-07T13:52:02Z INFO 47058 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.034 seconds +2025-08-07T13:52:02Z INFO 47058 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:52:02Z INFO 47058 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:52:02Z INFO 47058 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.042 seconds +2025-08-07T13:52:02Z INFO 47058 [sg0000/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-08-07T13:52:02Z INFO 47058 [sg0000/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-08-07T13:52:02Z INFO 47058 [sg0000/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.035 seconds +2025-08-07T13:52:02Z INFO 47058 [sg0000/Tensorizer/DeConcat]: Running DeConcat +2025-08-07T13:52:02Z INFO 47058 [sg0000/Tensorizer/DeConcat]: Finished (changed=False) +2025-08-07T13:52:02Z INFO 47058 [sg0000/Tensorizer/DeConcat]: DeConcat finished after 0.012 seconds +2025-08-07T13:52:02Z INFO 47058 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-08-07T13:52:02Z INFO 47058 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-08-07T13:52:02Z INFO 47058 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.036 seconds +2025-08-07T13:52:02Z INFO 47058 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-08-07T13:52:02Z INFO 47058 [sg0000/Tensorizer/PartialSimdFusion]: Finished (changed=False) +2025-08-07T13:52:02Z INFO 47058 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.207 seconds +2025-08-07T13:52:02Z INFO 47058 [sg0000/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-08-07T13:52:03Z INFO 47058 [sg0000/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-08-07T13:52:03Z INFO 47058 [sg0000/Tensorizer/TritiumFusion]: TritiumFusion finished after 1.090 seconds +2025-08-07T13:52:03Z INFO 47058 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:52:03Z INFO 47058 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-08-07T13:52:03Z INFO 47058 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.263 seconds +2025-08-07T13:52:03Z INFO 47058 [sg0000/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-08-07T13:52:03Z INFO 47058 [sg0000/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-08-07T13:52:03Z INFO 47058 [sg0000/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.021 seconds +2025-08-07T13:52:03Z INFO 47058 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-08-07T13:52:04Z INFO 47058 [sg0000/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-08-07T13:52:04Z INFO 47058 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.237 seconds +2025-08-07T13:52:04Z INFO 47058 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:52:04Z INFO 47058 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:52:04Z INFO 47058 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.203 seconds +2025-08-07T13:52:04Z INFO 47058 [sg0000/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:52:04Z INFO 47058 [sg0000/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-08-07T13:52:04Z INFO 47058 [sg0000/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.399 seconds +2025-08-07T13:52:04Z INFO 47058 [sg0000/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:52:04Z INFO 47058 [sg0000/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:52:04Z INFO 47058 [sg0000/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.047 seconds +2025-08-07T13:52:04Z INFO 47058 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:52:05Z INFO 47058 [sg0000/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-08-07T13:52:05Z INFO 47058 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.451 seconds +2025-08-07T13:52:05Z INFO 47058 [sg0000/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-08-07T13:52:05Z INFO 47058 [sg0000/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-08-07T13:52:05Z INFO 47058 [sg0000/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.038 seconds +2025-08-07T13:52:05Z INFO 47058 [sg0000/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:52:05Z INFO 47058 [sg0000/Tensorizer/SpillPSum]: Finished (changed=True) +2025-08-07T13:52:05Z INFO 47058 [sg0000/Tensorizer/SpillPSum]: SpillPSum finished after 0.543 seconds +2025-08-07T13:52:05Z INFO 47058 [sg0000/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:52:07Z INFO 47058 [sg0000/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-08-07T13:52:07Z INFO 47058 [sg0000/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 1.229 seconds +2025-08-07T13:52:07Z INFO 47058 [sg0000/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-08-07T13:52:07Z INFO 47058 [sg0000/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-08-07T13:52:07Z INFO 47058 [sg0000/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.054 seconds +2025-08-07T13:52:07Z INFO 47058 [sg0000/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:52:07Z INFO 47058 [sg0000/Tensorizer/LegalizeType]: Finished (changed=True) +2025-08-07T13:52:07Z INFO 47058 [sg0000/Tensorizer/LegalizeType]: LegalizeType finished after 0.208 seconds +2025-08-07T13:52:07Z INFO 47058 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:52:07Z INFO 47058 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:52:07Z INFO 47058 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.289 seconds +2025-08-07T13:52:07Z INFO 47058 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:52:08Z INFO 47058 [sg0000/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-08-07T13:52:08Z INFO 47058 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.978 seconds +2025-08-07T13:52:08Z INFO 47058 [sg0000/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:52:08Z INFO 47058 [sg0000/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-08-07T13:52:08Z INFO 47058 [sg0000/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.055 seconds +2025-08-07T13:52:08Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:52:10Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-08-07T13:52:10Z INFO 47058 [sg0000/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 1.454 seconds +2025-08-07T13:52:10Z INFO 47058 [sg0000/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-08-07T13:52:10Z INFO 47058 [sg0000/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-08-07T13:52:10Z INFO 47058 [sg0000/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.155 seconds +2025-08-07T13:52:10Z INFO 47058 [sg0000/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-08-07T13:52:10Z INFO 47058 [sg0000/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-08-07T13:52:10Z INFO 47058 [sg0000/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.134 seconds +2025-08-07T13:52:10Z INFO 47058 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:52:10Z INFO 47058 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:52:10Z INFO 47058 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.179 seconds +2025-08-07T13:52:10Z INFO 47058 [sg0000/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:52:10Z INFO 47058 [sg0000/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:52:10Z INFO 47058 [sg0000/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.090 seconds +2025-08-07T13:52:10Z INFO 47058 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:52:12Z INFO 47058 [sg0000/Tensorizer/SimplifyNeuronTensor]: Finished (changed=True) +2025-08-07T13:52:12Z INFO 47058 [sg0000/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 1.330 seconds +2025-08-07T13:52:12Z INFO 47058 [sg0000/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:52:12Z INFO 47058 [sg0000/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-08-07T13:52:12Z INFO 47058 [sg0000/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.035 seconds +2025-08-07T13:52:12Z INFO 47058 [sg0000/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:52:12Z INFO 47058 [sg0000/Tensorizer/DataStreaming]: Finished (changed=False) +2025-08-07T13:52:12Z INFO 47058 [sg0000/Tensorizer/DataStreaming]: DataStreaming finished after 0.154 seconds +2025-08-07T13:52:12Z INFO 47058 [sg0000/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:52:15Z INFO 47058 [sg0000/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:52:15Z INFO 47058 [sg0000/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 3.120 seconds +2025-08-07T13:52:15Z INFO 47058 [sg0000/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:52:15Z INFO 47058 [sg0000/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-08-07T13:52:15Z INFO 47058 [sg0000/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.227 seconds +2025-08-07T13:52:15Z INFO 47058 [sg0000/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:52:15Z INFO 47058 [sg0000/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-08-07T13:52:15Z INFO 47058 [sg0000/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.191 seconds +2025-08-07T13:52:15Z INFO 47058 [sg0000/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:52:15Z INFO 47058 [sg0000/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-08-07T13:52:15Z INFO 47058 [sg0000/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.066 seconds +2025-08-07T13:52:15Z INFO 47058 [sg0000/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:52:15Z INFO 47058 [sg0000/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:52:15Z INFO 47058 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 2.705ms (594.000MiB, est bw: 230.258GB/s, 7.724% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (594, 128, 4096) %'30788.46007'[i4422_0,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': '', 'transposable': True}bfloat16 (75968, 4096) %'input473'[128i4422_0+i0.128,i1.4096] # id=46006, src_id=None, , instances=594 # dl = tensor_op_name: input473_pftranspose_30788 | hlo_id: 18069 | if -128i4422_0-i0.128+75967 >= 0 [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-08-07T13:52:15Z INFO 47058 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.661% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input84_local_33013'[i148_0,i147_0_0_33017,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': '', 'transposable': True, 'static_io_transpose': {'reshape': (32, 128, 2, 24, 128), 'transpose': [0, 2, 4, 3, 1]}}bfloat16 (32, 2, 128, 3072) %'input84'[i148_0,i147_0_0_33017,i0.128,i1.3072] # id=37070, src_id=None, , instances=64 # dl = tensor_op_name: _dot.407 | hlo_id: 14041 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:52:15Z INFO 47058 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.661% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input95_local_33088'[i270_0,i269_0_0_33092,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': '', 'transposable': True, 'static_io_transpose': {'reshape': (32, 128, 2, 24, 128), 'transpose': [0, 2, 4, 3, 1]}}bfloat16 (32, 2, 128, 3072) %'input95'[i270_0,i269_0_0_33092,i0.128,i1.3072] # id=37244, src_id=None, , instances=64 # dl = tensor_op_name: _dot.739 | hlo_id: 14156 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:52:15Z INFO 47058 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.661% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input106_local_33163'[i392_0,i391_0_0_33167,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': '', 'transposable': True, 'static_io_transpose': {'reshape': (32, 128, 2, 24, 128), 'transpose': [0, 2, 4, 3, 1]}}bfloat16 (32, 2, 128, 3072) %'input106'[i392_0,i391_0_0_33167,i0.128,i1.3072] # id=37418, src_id=None, , instances=64 # dl = tensor_op_name: _dot.1071 | hlo_id: 14271 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:52:15Z INFO 47058 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.661% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input117_local_33238'[i514_0,i513_0_0_33242,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': '', 'transposable': True, 'static_io_transpose': {'reshape': (32, 128, 2, 24, 128), 'transpose': [0, 2, 4, 3, 1]}}bfloat16 (32, 2, 128, 3072) %'input117'[i514_0,i513_0_0_33242,i0.128,i1.3072] # id=37592, src_id=None, , instances=64 # dl = tensor_op_name: _dot.1403 | hlo_id: 14386 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:52:15Z INFO 47058 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.661% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input128_local_33313'[i636_0,i635_0_0_33317,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': '', 'transposable': True, 'static_io_transpose': {'reshape': (32, 128, 2, 24, 128), 'transpose': [0, 2, 4, 3, 1]}}bfloat16 (32, 2, 128, 3072) %'input128'[i636_0,i635_0_0_33317,i0.128,i1.3072] # id=37766, src_id=None, , instances=64 # dl = tensor_op_name: _dot.1735 | hlo_id: 14501 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:52:15Z INFO 47058 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.661% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input139_local_33388'[i758_0,i757_0_0_33392,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': '', 'transposable': True, 'static_io_transpose': {'reshape': (32, 128, 2, 24, 128), 'transpose': [0, 2, 4, 3, 1]}}bfloat16 (32, 2, 128, 3072) %'input139'[i758_0,i757_0_0_33392,i0.128,i1.3072] # id=37940, src_id=None, , instances=64 # dl = tensor_op_name: _dot.2067 | hlo_id: 14616 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:52:15Z INFO 47058 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.661% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input150_local_33463'[i880_0,i879_0_0_33467,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': '', 'transposable': True, 'static_io_transpose': {'reshape': (32, 128, 2, 24, 128), 'transpose': [0, 2, 4, 3, 1]}}bfloat16 (32, 2, 128, 3072) %'input150'[i880_0,i879_0_0_33467,i0.128,i1.3072] # id=38114, src_id=None, , instances=64 # dl = tensor_op_name: _dot.2399 | hlo_id: 14731 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:52:15Z INFO 47058 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.661% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input161_local_33538'[i1002_0,i1001_0_0_33542,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': '', 'transposable': True, 'static_io_transpose': {'reshape': (32, 128, 2, 24, 128), 'transpose': [0, 2, 4, 3, 1]}}bfloat16 (32, 2, 128, 3072) %'input161'[i1002_0,i1001_0_0_33542,i0.128,i1.3072] # id=38288, src_id=None, , instances=64 # dl = tensor_op_name: _dot.2731 | hlo_id: 14846 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:52:15Z INFO 47058 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.661% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input172_local_33613'[i1124_0,i1123_0_0_33617,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': '', 'transposable': True, 'static_io_transpose': {'reshape': (32, 128, 2, 24, 128), 'transpose': [0, 2, 4, 3, 1]}}bfloat16 (32, 2, 128, 3072) %'input172'[i1124_0,i1123_0_0_33617,i0.128,i1.3072] # id=38462, src_id=None, , instances=64 # dl = tensor_op_name: _dot.3063 | hlo_id: 14961 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:52:15Z INFO 47058 [sg0000/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:52:15Z INFO 47058 [sg0000/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.088 seconds +2025-08-07T13:52:15Z INFO 47058 [sg0000/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.000 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.001 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.001 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.003 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.001 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.005 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (1, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (1, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.000 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.000 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.001 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.000 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.001 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.000 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.003 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.001 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (1, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (1, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-08-07T13:52:16Z INFO 47058 [sg0000/Tensorizer/OptimizeNKIKernels]: Finished (changed=True) +2025-08-07T13:52:16Z INFO 47058 [sg0000/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.461 seconds +2025-08-07T13:52:16Z INFO 47058 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:52:16Z INFO 47058 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-08-07T13:52:16Z INFO 47058 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.411 seconds +2025-08-07T13:52:16Z INFO 47058 [sg0000/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-08-07T13:52:16Z WARNING 47058 [sg0000/Tensorizer/StaticProfiler]: matmul-based transposes inserted by penguin takes up 91.34 percent of all matmul computation +2025-08-07T13:52:16Z INFO 47058 [sg0000/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-08-07T13:52:16Z INFO 47058 [sg0000/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.133 seconds +2025-08-07T13:52:16Z INFO 47058 [sg0000/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-08-07T13:52:17Z INFO 47058 [sg0000/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-08-07T13:52:17Z INFO 47058 [sg0000/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.331 seconds +2025-08-07T13:52:17Z INFO 47058 [sg0000/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-08-07T13:52:17Z INFO 47058 [sg0000/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-08-07T13:52:17Z INFO 47058 [sg0000/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.092 seconds +2025-08-07T13:52:17Z INFO 47058 [sg0000/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-08-07T13:52:17Z INFO 47058 [sg0000/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-08-07T13:52:17Z INFO 47058 [sg0000/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.241 seconds +2025-08-07T13:52:17Z INFO 47058 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-08-07T13:52:17Z INFO 47058 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-08-07T13:52:17Z INFO 47058 [sg0000/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-08-07T13:52:17Z INFO 47058 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-08-07T13:52:20Z INFO 47058 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-08-07T13:52:20Z INFO 47058 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 2.421 seconds +2025-08-07T13:52:21Z INFO 47058 [Tensorizer]: BirCodeGen estimate #instances=322914 in sg0000 +2025-08-07T13:52:21Z INFO 47058 [Tensorizer]: IR signature: 0008d2291c40cbc1943864bc884fa81d07373338c5a8542a06f116c7446c8305 for nc00/sg0000/TensorizerBIR +2025-08-07T13:52:21Z INFO 47058 [Tensorizer]: Weights total number of bytes: 4854280 +2025-08-07T13:52:21Z INFO 47058 [Tensorizer]: Successfully built model. +2025-08-07T13:52:21Z USER 47058 [root/Tensorizer/Tensorizer]: Tensorizer finished after 114.786 seconds +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: End tensorization +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input0 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input1 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input2 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input3 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input4 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input5 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input6 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input7 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input8 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input9 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input10 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input11 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input12 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input13 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input14 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input15 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input16 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input17 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input18 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input19 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input20 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input21 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input22 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input23 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input24 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input25 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input26 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input27 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input28 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input29 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input30 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input31 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input32 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input33 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input34 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input35 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input36 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input37 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input38 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input39 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input40 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input41 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input42 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input43 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input44 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input45 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input46 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input47 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input48 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input49 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input50 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input51 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input52 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input53 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input54 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input55 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input56 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input57 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input58 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input59 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input60 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input61 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input62 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input63 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input64 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input65 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input66 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input67 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input68 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input69 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input70 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input71 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input72 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input73 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input74 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input75 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input76 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input77 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input78 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input79 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input80 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input81 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input82 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input83 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input84 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input85 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input86 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input87 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input88 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input89 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input90 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input91 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input92 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input93 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input94 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input95 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input96 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input97 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input98 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input99 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input100 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input101 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input102 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input103 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input104 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input105 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input106 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input107 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input108 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input109 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input110 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input111 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input112 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input113 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input114 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input115 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input116 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input117 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input118 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input119 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input120 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input121 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input122 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input123 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input124 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input125 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input126 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input127 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input128 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input129 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input130 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input131 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input132 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input133 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input134 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input135 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input136 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input137 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input138 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input139 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input140 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input141 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input142 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input143 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input144 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input145 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input146 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input147 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input148 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input149 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input150 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input151 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input152 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input153 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input154 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input155 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input156 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input157 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input158 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input159 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input160 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input161 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input162 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input163 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input164 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input165 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input166 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input167 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input168 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input169 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input170 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input171 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input172 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input173 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input174 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input175 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input176 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input177 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input178 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input179 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input180 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input181 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input182 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input183 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input184 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input185 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input186 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input187 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input188 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input189 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input190 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input191 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input192 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input193 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input194 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input195 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input196 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input197 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input198 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input199 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input200 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input201 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input202 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input203 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input204 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input205 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input206 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input207 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input208 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input209 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input210 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input211 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input212 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input213 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input214 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input215 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input216 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input217 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input218 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input219 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input220 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input221 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input222 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input223 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input224 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input225 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input226 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input227 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input228 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input229 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input230 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input231 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input232 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input233 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input234 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input235 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input236 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input237 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input238 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input239 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input240 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input241 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input242 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input243 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input244 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input245 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input246 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input247 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input248 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input249 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input250 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input251 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input252 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input253 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input254 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input255 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input256 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input257 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input258 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input259 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input260 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input261 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input262 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input263 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input264 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input265 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input266 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input267 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input268 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input269 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input270 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input271 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input272 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input273 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input274 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input275 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input276 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input277 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input278 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input279 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input280 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input281 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input282 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input283 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input284 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input285 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input286 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input287 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input288 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input289 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input290 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input291 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input292 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input293 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input294 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input295 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input296 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input297 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input298 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input299 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input300 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input301 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input302 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input303 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input304 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input305 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input306 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input307 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input308 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input309 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input310 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input311 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input312 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input313 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input314 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input315 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input316 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input317 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input318 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input319 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input320 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input321 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input322 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input323 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input324 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input325 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input326 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input327 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input328 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input329 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input330 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input331 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input332 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input333 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input334 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input335 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input336 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input337 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input338 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input339 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input340 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input341 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input342 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input343 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input344 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input345 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input346 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input347 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input348 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input349 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input350 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input351 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input352 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input353 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input354 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input355 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input356 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input357 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input358 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input359 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input360 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input361 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input362 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input363 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input364 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input365 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input366 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input367 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input368 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input369 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input370 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input371 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input372 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input373 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input374 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input375 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input376 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input377 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input378 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input379 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input380 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input381 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input382 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input383 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input384 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input385 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input386 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input387 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input388 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input389 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input390 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input391 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input392 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input393 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input394 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input395 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input396 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input397 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input398 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input399 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input400 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input401 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input402 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input403 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input404 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input405 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input406 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input407 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input408 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input409 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input410 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input411 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input412 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input413 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input414 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input415 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input416 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input417 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input418 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input419 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input420 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input421 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input422 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input423 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input424 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input425 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input426 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input427 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input428 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input429 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input430 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input431 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input432 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input433 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input434 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input435 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input436 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input437 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input438 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input439 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input440 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input441 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input442 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input443 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input444 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input445 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input446 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input447 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input448 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input449 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input450 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input451 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input452 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input453 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input454 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input455 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input456 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input457 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input458 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input459 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input460 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input461 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input462 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input463 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input464 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input465 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input466 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input467 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input468 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input469 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input470 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input471 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input472 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input473 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Network input: input474 +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: wrote bir.json +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: wrote tensor_map.json +2025-08-07T13:52:21Z INFO 47058 [job.Frontend.0]: Job #0 finished +2025-08-07T13:52:21Z INFO 47058 [pipeline.Pipeline.0]: Finished job job.Frontend.0 +2025-08-07T13:52:21Z INFO 47058 [pipeline.Pipeline.0]: Starting job job.StaticIOTranspose.0 +2025-08-07T13:52:21Z INFO 47058 [pipeline.Pipeline.0]: Finished job job.StaticIOTranspose.0 +2025-08-07T13:52:21Z INFO 47058 [pipeline.Pipeline.0]: Starting job job.WalrusDriver.0 +2025-08-07T13:52:21Z INFO 47058 [job.WalrusDriver.0]: BackendDriver has 1 states with 1 core LNC +2025-08-07T13:52:21Z INFO 47058 [job.WalrusDriver.0]: BackendDriver: no partitions found. Switching to flat flow. +2025-08-07T13:52:21Z INFO 47058 [job.WalrusDriver.0]: Job WalrusDriver len(in_states) 1 +2025-08-07T13:52:21Z INFO 47058 [job.WalrusDriver.0]: Processing input #0 +2025-08-07T13:52:21Z INFO 47058 [job.WalrusDriver.0]: BackendDriver in_state.num_states 1 with 1 core LNC +2025-08-07T13:52:21Z INFO 47058 [job.WalrusDriver.0]: Executing /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/walrus_driver --optlevel 2 --allocator coloring --verbose 35 --logfile-verbose 20 --logfile /home/ubuntu/qwen3/token_generation_model/_tp0_bk0/log-neuron-cc.txt --execute-repetition 1 -i bir.json --min_split_size 10240 --skip_split_vns '' --no_split_dram --split_huge_dram_tensor 1.0 --preprocessing_only --max_tensorizer_distance 64 --pack_same_shape_only --instruction_fetch_latency 511 --max-partitions 1 --policy 3 --auxflag 0 --interleave none --schedule-delayed-latency 1 --postsched-mm-accum-reorder=false --max-load-color-rotation --max-load-lower-bound 0.14 --mm-reorder-opt --force-prefetch-follow-incoming-order -1 --allreduce-buffer-size 500 --dram-page-size 512 --dram-rotation-size -1 --allreduce-rotation-dis 8 --repeat-load-thres 4 --enable-mm-transpose-remat-optimization=true --save-len-thres 512 --save-dma-cnt-thres 32 --relaxed-order=true --enable-anti-dependence-reduction=false --num-semaphores-per-queue 16 --numcores 1 --act-root-json /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/pwp/pwp_bin_trainium/act_info.json --dve-root-json /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen2/dve_info.json --unified-backend-and-legacy-codegen --tensor-map tensor_map.json --enable-verifier=true --enable-birsim=false --enable-birsim-sync-only=false --enable-data-race-checker=false --enable-new-backend=true --inject-error=NONE --dge-levels vector_dynamic_offsets,scalar_dynamic_offset,io --dynamic-dma-scratch-size-per-partition=16384 --neff-output-filename /home/ubuntu/qwen3/token_generation_model/_tp0_bk0/model.MODULE_6ef5ba8b41fbbe77f080+74ae8282.neff +2025-08-07T13:52:21Z INFO 47058 [job.WalrusDriver.0]: Working directory is /home/ubuntu/qwen3/token_generation_model/_tp0_bk0/neuronxcc-ykq_7n9z/sg00 +2025-08-07T13:52:21Z INFO 47058 [job.WalrusDriver.0]: propagate_exit=True +2025-08-07T13:52:21Z INFO 47058 [job.WalrusDriver.0]: use_logger=False +2025-08-07T13:52:21Z INFO 47058 [job.WalrusDriver.0]: expose_stderr=True +2025-08-07T13:52:22Z INFO 47306 [Logging]: Logging to ../../log-neuron-cc.txt at level 'INFO' +2025-08-07T13:52:22Z INFO 47306 [BackendDriver]: max_allowed_parallelism=128 +2025-08-07T13:52:22Z INFO 47306 [BackendDriver]: Backend driver mtBackend: false numModules: 1 Cwd: "/home/ubuntu/qwen3/token_generation_model/_tp0_bk0/neuronxcc-ykq_7n9z/sg00" +2025-08-07T13:52:22Z INFO 47306 [BackendDriver]: DynamicDMA is enabled +2025-08-07T13:52:22Z INFO 47306 [BackendDriver]: DynamicDMA levels being enabled: io, scalar_dynamic_offset, vector_dynamic_offsets, +2025-08-07T13:52:22Z USER 47306 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:52:22Z INFO 47306 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=7295 blocks=1 instructions=7257 Max writers: 191 Max Readers: 475 +2025-08-07T13:52:22Z USER 47306 [ModuleForkPass]: Running do_nothing +2025-08-07T13:52:22Z INFO 47306 [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=7295 blocks=1 instructions=7257 Max writers: 191 Max Readers: 475 +2025-08-07T13:52:22Z USER 47306 [ModuleForkPass]: do_nothing finished after 0.001 seconds +2025-08-07T13:52:22Z INFO 47306 [ModuleForkPass]: curr_vmrss: 211mb, ru_maxrss: 670mb (delta=0mb) +2025-08-07T13:52:22Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 7295 memory location(s), 1 block(s), and 7257 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:52:22Z USER 47306 [ModuleForkPass]: Running birverifier +2025-08-07T13:52:22Z INFO 47306 [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=7295 blocks=1 instructions=7257 Max writers: 191 Max Readers: 475 +2025-08-07T13:52:22Z WARNING 47306 [birverifier::InstVisitor]: (module) Non - output memory location with no reader: {convert.357.56865}@SB<0,0>(1x2)#Internal DebugInfo: +2025-08-07T13:52:22Z USER 47306 [ModuleForkPass]: birverifier finished after 0.272 seconds +2025-08-07T13:52:22Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1002mb, ru_maxrss: 1002mb (delta=332mb) +2025-08-07T13:52:22Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 7295 memory location(s), 1 block(s), and 7257 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:52:22Z USER 47306 [BackendPassManager]: mod_parallel_pass finished after 0.278 seconds +2025-08-07T13:52:22Z INFO 47306 [BackendPassManager]: curr_vmrss: 994mb, ru_maxrss: 1002mb (delta=332mb) +2025-08-07T13:52:22Z INFO 47306 [BackendPassManager]: Output has 1 module(s), 1 function(s), 7295 memory location(s), 1 block(s), and 7257 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:52:22Z USER 47306 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:52:22Z INFO 47306 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=1 allocs=7295 blocks=1 instructions=7257 Max writers: 191 Max Readers: 475 +2025-08-07T13:52:22Z USER 47306 [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:52:22Z INFO 47306 [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=7295 blocks=1 instructions=7257 Max writers: 191 Max Readers: 475 +2025-08-07T13:52:22Z USER 47306 [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-08-07T13:52:22Z INFO 47306 [SubgraphForkPass]: curr_vmrss: 994mb, ru_maxrss: 1002mb (delta=0mb) +2025-08-07T13:52:22Z INFO 47306 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 7295 memory location(s), 1 block(s), and 7257 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:52:22Z USER 47306 [BackendPassManager]: subgraph_parallel_pass finished after 0.002 seconds +2025-08-07T13:52:22Z INFO 47306 [BackendPassManager]: curr_vmrss: 994mb, ru_maxrss: 1002mb (delta=0mb) +2025-08-07T13:52:22Z INFO 47306 [BackendPassManager]: Output has 1 module(s), 1 function(s), 7295 memory location(s), 1 block(s), and 7257 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:52:22Z USER 47306 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:52:22Z INFO 47306 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=7295 blocks=1 instructions=7257 Max writers: 191 Max Readers: 475 +2025-08-07T13:52:22Z USER 47306 [ModuleForkPass]: Running expand_replication +2025-08-07T13:52:22Z INFO 47306 [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=7295 blocks=1 instructions=7257 Max writers: 191 Max Readers: 475 +2025-08-07T13:52:22Z INFO 47306 [ExpandReplication]: Found 0 replicated matmults +2025-08-07T13:52:22Z USER 47306 [ModuleForkPass]: expand_replication finished after 0.001 seconds +2025-08-07T13:52:22Z INFO 47306 [ModuleForkPass]: curr_vmrss: 994mb, ru_maxrss: 1002mb (delta=0mb) +2025-08-07T13:52:22Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 7295 memory location(s), 1 block(s), and 7257 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:52:22Z USER 47306 [ModuleForkPass]: Running unroll +2025-08-07T13:52:22Z INFO 47306 [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=7295 blocks=1 instructions=7257 Max writers: 191 Max Readers: 475 +2025-08-07T13:52:22Z INFO 47306 [Unroll]: INFO (Unroll) Start unrolling at Thu Aug 7 13:52:22 2025 +2025-08-07T13:52:25Z INFO 47306 [Unroll]: INFO (Unroll) DONE unrolling Thu Aug 7 13:52:22 2025 + +2025-08-07T13:52:25Z INFO 47306 [Unroll]: sg0000 Instruction count after Unroll: +2025-08-07T13:52:25Z INFO 47306 [Unroll]: Total count: 277097 +2025-08-07T13:52:25Z INFO 47306 [Unroll]: Matmult: 251660 +2025-08-07T13:52:25Z INFO 47306 [Unroll]: GenericCopy: 11565 +2025-08-07T13:52:25Z INFO 47306 [Unroll]: Load: 8259 +2025-08-07T13:52:25Z INFO 47306 [Unroll]: TensorScalarPtr: 1482 +2025-08-07T13:52:25Z INFO 47306 [Unroll]: TensorTensor: 1125 +2025-08-07T13:52:25Z INFO 47306 [Unroll]: Save: 682 +2025-08-07T13:52:25Z INFO 47306 [Unroll]: Activation: 545 +2025-08-07T13:52:25Z INFO 47306 [Unroll]: Memset: 299 +2025-08-07T13:52:25Z INFO 47306 [Unroll]: Max: 224 +2025-08-07T13:52:25Z INFO 47306 [Unroll]: MaxIndex: 224 +2025-08-07T13:52:25Z INFO 47306 [Unroll]: StreamShuffle: 222 +2025-08-07T13:52:25Z INFO 47306 [Unroll]: MatchReplace: 217 +2025-08-07T13:52:25Z INFO 47306 [Unroll]: TensorReduce: 151 +2025-08-07T13:52:25Z INFO 47306 [Unroll]: CollectiveCompute: 75 +2025-08-07T13:52:25Z INFO 47306 [Unroll]: Reciprocal: 75 +2025-08-07T13:52:25Z INFO 47306 [Unroll]: DMACopy: 74 +2025-08-07T13:52:25Z INFO 47306 [Unroll]: Iota: 73 +2025-08-07T13:52:25Z INFO 47306 [Unroll]: StreamTranspose: 72 +2025-08-07T13:52:25Z INFO 47306 [Unroll]: Select: 38 +2025-08-07T13:52:25Z INFO 47306 [Unroll]: Gather: 35 +2025-08-07T13:52:25Z INFO 47306 [Unroll]: Unrolled DGE count with Dynamic AP: 73 +2025-08-07T13:52:25Z USER 47306 [ModuleForkPass]: unroll finished after 2.563 seconds +2025-08-07T13:52:25Z INFO 47306 [ModuleForkPass]: curr_vmrss: 2379mb, ru_maxrss: 2379mb (delta=1377mb) +2025-08-07T13:52:25Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28306 memory location(s), 1 block(s), and 277097 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:25Z USER 47306 [BackendPassManager]: mod_parallel_pass finished after 2.613 seconds +2025-08-07T13:52:25Z INFO 47306 [BackendPassManager]: curr_vmrss: 1481mb, ru_maxrss: 2379mb (delta=1377mb) +2025-08-07T13:52:25Z INFO 47306 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28306 memory location(s), 1 block(s), and 277097 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:25Z USER 47306 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:52:25Z INFO 47306 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=1 allocs=28306 blocks=1 instructions=277097 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:25Z USER 47306 [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:52:25Z INFO 47306 [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=28306 blocks=1 instructions=277097 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:25Z INFO 47306 [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:52:25Z INFO 47306 [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:52:25Z INFO 47306 [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:52:25Z INFO 47306 [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:52:25Z USER 47306 [SubgraphForkPass]: dead_code_elim finished after 0.288 seconds +2025-08-07T13:52:25Z INFO 47306 [SubgraphForkPass]: curr_vmrss: 1489mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:52:25Z INFO 47306 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 27829 memory location(s), 1 block(s), and 277096 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:25Z USER 47306 [BackendPassManager]: subgraph_parallel_pass finished after 0.291 seconds +2025-08-07T13:52:25Z INFO 47306 [BackendPassManager]: curr_vmrss: 1489mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:52:25Z INFO 47306 [BackendPassManager]: Output has 1 module(s), 1 function(s), 27829 memory location(s), 1 block(s), and 277096 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:25Z USER 47306 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:52:25Z INFO 47306 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=27829 blocks=1 instructions=277096 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:25Z USER 47306 [ModuleForkPass]: Running birverifier +2025-08-07T13:52:25Z INFO 47306 [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=27829 blocks=1 instructions=277096 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:25Z USER 47306 [ModuleForkPass]: birverifier finished after 0.259 seconds +2025-08-07T13:52:25Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1501mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:52:25Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27829 memory location(s), 1 block(s), and 277096 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:25Z USER 47306 [BackendPassManager]: mod_parallel_pass finished after 0.263 seconds +2025-08-07T13:52:25Z INFO 47306 [BackendPassManager]: curr_vmrss: 1501mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:52:25Z INFO 47306 [BackendPassManager]: Output has 1 module(s), 1 function(s), 27829 memory location(s), 1 block(s), and 277096 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:25Z USER 47306 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:52:25Z INFO 47306 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=1 allocs=27829 blocks=1 instructions=277096 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:25Z USER 47306 [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:52:25Z INFO 47306 [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=27829 blocks=1 instructions=277096 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:25Z USER 47306 [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-08-07T13:52:25Z INFO 47306 [SubgraphForkPass]: curr_vmrss: 1501mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:52:25Z INFO 47306 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 27829 memory location(s), 1 block(s), and 277096 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:25Z USER 47306 [BackendPassManager]: subgraph_parallel_pass finished after 0.003 seconds +2025-08-07T13:52:25Z INFO 47306 [BackendPassManager]: curr_vmrss: 1501mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:52:25Z INFO 47306 [BackendPassManager]: Output has 1 module(s), 1 function(s), 27829 memory location(s), 1 block(s), and 277096 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:25Z USER 47306 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:52:25Z INFO 47306 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=27829 blocks=1 instructions=277096 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:25Z USER 47306 [ModuleForkPass]: Running instruction_reorder +2025-08-07T13:52:25Z INFO 47306 [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=27829 blocks=1 instructions=277096 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:25Z USER 47306 [ModuleForkPass]: instruction_reorder finished after 0.045 seconds +2025-08-07T13:52:25Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1501mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:52:25Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27829 memory location(s), 1 block(s), and 277096 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:25Z USER 47306 [ModuleForkPass]: Running psum_legalization +2025-08-07T13:52:25Z INFO 47306 [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=27829 blocks=1 instructions=277096 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:25Z USER 47306 [ModuleForkPass]: psum_legalization finished after 0.022 seconds +2025-08-07T13:52:25Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1501mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:52:25Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27829 memory location(s), 1 block(s), and 277096 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:25Z USER 47306 [ModuleForkPass]: Running legalize_cce_dma +2025-08-07T13:52:26Z INFO 47306 [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=27829 blocks=1 instructions=277096 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:26Z USER 47306 [ModuleForkPass]: legalize_cce_dma finished after 0.024 seconds +2025-08-07T13:52:26Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1501mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:52:26Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27829 memory location(s), 1 block(s), and 277096 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:26Z USER 47306 [ModuleForkPass]: Running error_injector +2025-08-07T13:52:26Z INFO 47306 [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=27829 blocks=1 instructions=277096 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:26Z WARNING 47306 [ErrorInjector]: Unrecognized injected error value "0" +2025-08-07T13:52:26Z USER 47306 [ModuleForkPass]: error_injector finished after 0.001 seconds +2025-08-07T13:52:26Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1501mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:52:26Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27829 memory location(s), 1 block(s), and 277096 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:26Z USER 47306 [ModuleForkPass]: Running vn_splitter +2025-08-07T13:52:26Z INFO 47306 [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=27829 blocks=1 instructions=277096 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:26Z INFO 47306 [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 14 +2025-08-07T13:52:26Z INFO 47306 [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-08-07T13:52:26Z INFO 47306 [ShrinkDN]: INFO (ShrinkDN): Shrunk 2 nodes. Total savings 14336 bytes/partition +2025-08-07T13:52:26Z INFO 47306 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-08-07T13:52:26Z INFO 47306 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-08-07T13:52:26Z INFO 47306 [VNSplitterPass]: INFO (VNSplitter) Time: 0.001 seconds +2025-08-07T13:52:26Z INFO 47306 [VNSplitterPass]: INFO (VerticalFusion) Time: 0.032 seconds +2025-08-07T13:52:26Z INFO 47306 [VNSplitterPass]: INFO (ShrinkDN) Time: 0.044 seconds +2025-08-07T13:52:26Z USER 47306 [ModuleForkPass]: vn_splitter finished after 0.123 seconds +2025-08-07T13:52:26Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1505mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:52:26Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27829 memory location(s), 1 block(s), and 277096 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:26Z USER 47306 [ModuleForkPass]: Running constant_propagate +2025-08-07T13:52:26Z INFO 47306 [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=27829 blocks=1 instructions=277096 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:26Z INFO 47306 [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-08-07T13:52:26Z INFO 47306 [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:52:26Z INFO 47306 [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:52:26Z INFO 47306 [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:52:26Z INFO 47306 [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:52:26Z INFO 47306 [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-08-07T13:52:26Z INFO 47306 [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:52:26Z INFO 47306 [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:52:26Z INFO 47306 [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:52:26Z INFO 47306 [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:52:26Z USER 47306 [ModuleForkPass]: constant_propagate finished after 0.610 seconds +2025-08-07T13:52:26Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1507mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:52:26Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27829 memory location(s), 1 block(s), and 277096 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:26Z USER 47306 [ModuleForkPass]: Running lower_ac +2025-08-07T13:52:26Z INFO 47306 [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=27829 blocks=1 instructions=277096 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:26Z INFO 47306 [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-08-07T13:52:26Z USER 47306 [ModuleForkPass]: lower_ac finished after 0.040 seconds +2025-08-07T13:52:26Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1507mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:52:26Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27829 memory location(s), 1 block(s), and 277096 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:26Z USER 47306 [ModuleForkPass]: Running input_dma_coalescing +2025-08-07T13:52:26Z INFO 47306 [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=27829 blocks=1 instructions=277096 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:26Z INFO 47306 [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-08-07T13:52:26Z USER 47306 [ModuleForkPass]: input_dma_coalescing finished after 0.077 seconds +2025-08-07T13:52:26Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1507mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:52:26Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27829 memory location(s), 1 block(s), and 277096 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:26Z USER 47306 [ModuleForkPass]: Running remat_optimization +2025-08-07T13:52:26Z INFO 47306 [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=27829 blocks=1 instructions=277096 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:27Z INFO 47306 [RematOpt]: Removed 0 remat instructions +2025-08-07T13:52:27Z USER 47306 [ModuleForkPass]: remat_optimization finished after 0.142 seconds +2025-08-07T13:52:27Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1509mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:52:27Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27829 memory location(s), 1 block(s), and 277096 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:27Z USER 47306 [ModuleForkPass]: Running early_peephole_opts +2025-08-07T13:52:27Z INFO 47306 [ModuleForkPass]: Inputs to early_peephole_opts: modules=1 functions=1 allocs=27829 blocks=1 instructions=277096 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:27Z INFO 47306 [EarlyPeepholeOpts]: PeepholeOpts enabled? ActivationAccumulate: true +2025-08-07T13:52:27Z INFO 47306 [EarlyPeepholeOpts]: Activation Accumulate: 0 +2025-08-07T13:52:27Z USER 47306 [ModuleForkPass]: early_peephole_opts finished after 0.085 seconds +2025-08-07T13:52:27Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1509mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:52:27Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27829 memory location(s), 1 block(s), and 277096 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:27Z USER 47306 [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-08-07T13:52:27Z INFO 47306 [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=27829 blocks=1 instructions=277096 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:27Z USER 47306 [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.021 seconds +2025-08-07T13:52:27Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1509mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:52:27Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27829 memory location(s), 1 block(s), and 277096 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:27Z USER 47306 [ModuleForkPass]: Running infer_stream_ids +2025-08-07T13:52:27Z INFO 47306 [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=27829 blocks=1 instructions=277096 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:27Z USER 47306 [ModuleForkPass]: infer_stream_ids finished after 0.021 seconds +2025-08-07T13:52:27Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1509mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:52:27Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27829 memory location(s), 1 block(s), and 277096 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:27Z USER 47306 [ModuleForkPass]: Running pre_sched +2025-08-07T13:52:27Z INFO 47306 [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=27829 blocks=1 instructions=277096 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:27Z INFO 47306 [PreSched]: Start PRE scheduling 2 cores: 1 at: Thu Aug 7 13:52:27 2025 +2025-08-07T13:52:27Z INFO 47306 [LayerSpiller]: LayerSpill: Start... +2025-08-07T13:52:27Z INFO 47306 [LayerSpiller]: LayerSpill: Found 72 Splits CCs +2025-08-07T13:52:27Z INFO 47306 [LayerSpiller]: Grouped CCs to 72 clusters. +2025-08-07T13:52:27Z INFO 47306 [LayerSpiller]: LayerSpill: To Spill 60 multi-layer tensors +2025-08-07T13:52:27Z INFO 47306 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-08-07T13:52:27Z INFO 47306 [LayerSpiller]: LayerSpill: Done. +2025-08-07T13:52:27Z INFO 47306 [PreSched]: Start split live ranges Thu Aug 7 13:52:27 2025 +2025-08-07T13:52:27Z INFO 47306 [PreSched]: Num_Splits: 0 +2025-08-07T13:52:27Z INFO 47306 [PreSched]: End split live ranges Thu Aug 7 13:52:27 2025 +2025-08-07T13:52:27Z INFO 47306 [PreSched]: Strt remove redundncies Thu Aug 7 13:52:27 2025 +2025-08-07T13:52:27Z INFO 47306 [PreSched]: remove_redundant_memsets +2025-08-07T13:52:27Z INFO 47306 [PreSched]: remove_redundant_memsets: 0 +2025-08-07T13:52:27Z INFO 47306 [PreSched]: remove_redundant_loads +2025-08-07T13:52:27Z INFO 47306 [PreSched]: remove_redundant_loads: 0 +2025-08-07T13:52:27Z INFO 47306 [PreSched]: End remove redundncies Thu Aug 7 13:52:27 2025 +2025-08-07T13:52:27Z INFO 47306 [PreSched]: Start DCE Thu Aug 7 13:52:27 2025 +2025-08-07T13:52:27Z INFO 47306 [PreSched]: eliminateDeadStore removed 0 instructions +2025-08-07T13:52:27Z INFO 47306 [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:52:27Z INFO 47306 [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:52:27Z INFO 47306 [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:52:27Z INFO 47306 [PreSched]: End DCE Thu Aug 7 13:52:27 2025 +2025-08-07T13:52:27Z INFO 47306 [PreSched]: Start build flow dependencies Thu Aug 7 13:52:27 2025 +2025-08-07T13:52:27Z INFO 47306 [build_flow_deps]: Start build fdeps. Invocation: 1Thu Aug 7 13:52:27 2025 +2025-08-07T13:52:27Z INFO 47306 [build_flow_deps]: Allocs: 27949 instructions: 277216 +2025-08-07T13:52:28Z INFO 47306 [build_flow_deps]: Build fdeps inserted 816554 edges +2025-08-07T13:52:28Z INFO 47306 [build_flow_deps]: Done build fdeps 816554 Thu Aug 7 13:52:28 2025 +2025-08-07T13:52:28Z INFO 47306 [PreSched]: End build flow dependencies Thu Aug 7 13:52:28 2025 +2025-08-07T13:52:28Z INFO 47306 [PreSched]: Start remove useless insts Thu Aug 7 13:52:28 2025 +2025-08-07T13:52:28Z INFO 47306 [PreSched]: remove_useless_insts +2025-08-07T13:52:28Z INFO 47306 [PreSched]: remove Useless Instructions: 0 +2025-08-07T13:52:28Z INFO 47306 [PreSched]: End remove useless insts Thu Aug 7 13:52:28 2025 +2025-08-07T13:52:28Z INFO 47306 [PreSched]: Start scratchpad optimization Thu Aug 7 13:52:28 2025 +2025-08-07T13:52:28Z INFO 47306 [PreSched]: End scratchpad optimization Thu Aug 7 13:52:28 2025 +2025-08-07T13:52:28Z INFO 47306 [PreSched]: DONE PRE scheduling Thu Aug 7 13:52:28 2025 +2025-08-07T13:52:28Z USER 47306 [ModuleForkPass]: pre_sched finished after 1.701 seconds +2025-08-07T13:52:28Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1658mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:52:28Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27949 memory location(s), 1 block(s), and 277216 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:28Z USER 47306 [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:52:28Z INFO 47306 [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=27949 blocks=1 instructions=277216 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:28Z INFO 47306 [TensorCopyElim]: Tensor CP elimination: 1 +2025-08-07T13:52:29Z INFO 47306 [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:52:29Z INFO 47306 [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:52:29Z INFO 47306 [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:52:29Z INFO 47306 [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:52:29Z USER 47306 [ModuleForkPass]: tensor_copy_elim finished after 0.380 seconds +2025-08-07T13:52:29Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1658mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:52:29Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27948 memory location(s), 1 block(s), and 277215 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:29Z USER 47306 [ModuleForkPass]: Running dynamic_dma_setup +2025-08-07T13:52:29Z INFO 47306 [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=27948 blocks=1 instructions=277215 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:29Z USER 47306 [ModuleForkPass]: dynamic_dma_setup finished after 0.001 seconds +2025-08-07T13:52:29Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1658mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:52:29Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27949 memory location(s), 1 block(s), and 277215 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:29Z USER 47306 [ModuleForkPass]: Running runtime_memory_reservation +2025-08-07T13:52:29Z INFO 47306 [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=27949 blocks=1 instructions=277215 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:29Z USER 47306 [ModuleForkPass]: runtime_memory_reservation finished after 0.001 seconds +2025-08-07T13:52:29Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1658mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:52:29Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27949 memory location(s), 1 block(s), and 277215 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:29Z USER 47306 [ModuleForkPass]: Running coloring_allocator_psum +2025-08-07T13:52:29Z INFO 47306 [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=27949 blocks=1 instructions=277215 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:29Z INFO 47306 [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:52:29Z INFO 47306 [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:52:29Z INFO 47306 [PSUM_Allocator]: allocating PSUM +2025-08-07T13:52:29Z INFO 47306 [PSUM_Allocator]: main loop +2025-08-07T13:52:29Z INFO 47306 [PSUM_Allocator]: renumber locations +2025-08-07T13:52:29Z INFO 47306 [PSUM_Allocator]: size = 11741 +2025-08-07T13:52:29Z INFO 47306 [PSUM_Allocator]: build_no_bitmap start +2025-08-07T13:52:29Z INFO 47306 [PSUM_Allocator]: 100% PSUM demand before spilling +2025-08-07T13:52:29Z INFO 47306 [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-08-07T13:52:29Z INFO 47306 [PSUM_Allocator]: found 22033 edges +2025-08-07T13:52:29Z INFO 47306 [PSUM_Allocator]: mean: 3.75317 +2025-08-07T13:52:29Z INFO 47306 [PSUM_Allocator]: median: 2.28766 +2025-08-07T13:52:29Z INFO 47306 [PSUM_Allocator]: adjacency vectors require 176264 bytes +2025-08-07T13:52:29Z INFO 47306 [PSUM_Allocator]: build_no_bitmap done +2025-08-07T13:52:29Z INFO 47306 [PSUM_Allocator]: find costs +2025-08-07T13:52:34Z INFO 47306 [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-08-07T13:52:34Z INFO 47306 [PSUM_Allocator]: simplify interference graph +2025-08-07T13:52:34Z INFO 47306 [PSUM_Allocator]: initialize low and high +2025-08-07T13:52:34Z INFO 47306 [PSUM_Allocator]: lo = 11741 +2025-08-07T13:52:34Z INFO 47306 [PSUM_Allocator]: hi = 0 +2025-08-07T13:52:34Z INFO 47306 [PSUM_Allocator]: inf = 0 +2025-08-07T13:52:34Z INFO 47306 [PSUM_Allocator]: total = 11741 +2025-08-07T13:52:34Z INFO 47306 [PSUM_Allocator]: simplify +2025-08-07T13:52:34Z INFO 47306 [PSUM_Allocator]: new candidates = 0 +2025-08-07T13:52:34Z INFO 47306 [PSUM_Allocator]: select ranges +2025-08-07T13:52:34Z INFO 47306 [PSUM_Allocator]: no more spills +2025-08-07T13:52:34Z INFO 47306 [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-08-07T13:52:34Z INFO 47306 [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-08-07T13:52:34Z INFO 47306 [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-08-07T13:52:34Z USER 47306 [ModuleForkPass]: coloring_allocator_psum finished after 5.706 seconds +2025-08-07T13:52:34Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1661mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:52:34Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27949 memory location(s), 1 block(s), and 277215 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:34Z USER 47306 [ModuleForkPass]: Running dma_optimization_psum +2025-08-07T13:52:34Z INFO 47306 [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=27949 blocks=1 instructions=277215 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:35Z INFO 47306 [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-08-07T13:52:35Z INFO 47306 [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-08-07T13:52:35Z USER 47306 [ModuleForkPass]: dma_optimization_psum finished after 0.174 seconds +2025-08-07T13:52:35Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1661mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:52:35Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27949 memory location(s), 1 block(s), and 277215 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:35Z USER 47306 [ModuleForkPass]: Running address_rotation_psum +2025-08-07T13:52:35Z INFO 47306 [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=27949 blocks=1 instructions=277215 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:35Z INFO 47306 [DMAOptimizationBase]: PSUM Rotation rotated 832 PSUM Banks +2025-08-07T13:52:35Z INFO 47306 [DMAOptimizationBase]: PSUM Rotation rotated 202 PSUM Banks +2025-08-07T13:52:36Z INFO 47306 [DMAOptimizationBase]: PSUM Rotation rotated 335 PSUM Banks +2025-08-07T13:52:36Z USER 47306 [ModuleForkPass]: address_rotation_psum finished after 1.049 seconds +2025-08-07T13:52:36Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1667mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:52:36Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27949 memory location(s), 1 block(s), and 277215 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:36Z USER 47306 [ModuleForkPass]: Running coloring_allocator_sb +2025-08-07T13:52:36Z INFO 47306 [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=27949 blocks=1 instructions=277215 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:52:36Z INFO 47306 [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 7583364294 +2025-08-07T13:52:36Z INFO 47306 [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 7312 bytes +2025-08-07T13:52:36Z INFO 47306 [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 2812042 +2025-08-07T13:52:36Z INFO 47306 [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 397 bytes +2025-08-07T13:52:36Z INFO 47306 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 78980 +2025-08-07T13:52:36Z INFO 47306 [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 136 bytes +2025-08-07T13:52:36Z INFO 47306 [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:52:36Z INFO 47306 [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: allocating SB +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: main loop +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: renumber locations +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: size = 15358 +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: find partners +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: found 11522 accumulation groups +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: largest = _dot.9359-t36549 +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: tensors = 49 +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: requires 393280 bytes/partition +2025-08-07T13:52:36Z WARNING 47306 [SB_Allocator]: accumulation group is too large for SB +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: expanding partners +2025-08-07T13:52:36Z INFO 47306 []: find first defs for local +2025-08-07T13:52:36Z INFO 47306 []: find first defs for global +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: find loads +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: 1 pin count +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: 8233 remat count +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: build interference graph +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: pass 1 int-tree +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: Num intervals 15358 Num locations 15358 +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: IntervalTree Build Done +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: info.neighbors init Done +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: info.neighbors partners Done +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: IntervalTree readback Done +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: edge: 141619 +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: mean: 18.4424 +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: median: 10.3191 +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: find costs +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: simplify interference graph +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: initialize safe & unsafe +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: safe = 14854 +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: unsafe = 326 +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: inf = 177 +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: total = 15357 +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: simplify +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: simplify_step3_sorted2 #Unsafe 106 #Pinned 0 #Safe 0 minCost 0.00302294 maxCost 2.36906 locations 15358 +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: new candidates = 8 +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: select ranges +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: Total: 15357 +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: Spilled: 0.000 (0) +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: Allocated: 1.000 (15357) +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: Rover zone: 0.958 (14715) +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: Pre-rover zone: 0.033 (512) +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: Post-rover zone: 0.008 (126) +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: Slice zone: 0.000 (4) +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: Blocks nothing: 0.041 (633) +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: Blocks medium: 0.008 (120) +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: Visited until medium blocking (mean): 0.444 +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: Visited until medium blocking (median): 0.456 +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: Visited until medium blocking (p95): 0.818 +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: Blocks tall: 0.951 (14604) +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: Visited until tall blocking (mean): 0.894 +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-08-07T13:52:36Z INFO 47306 [SB_Allocator]: Success +2025-08-07T13:53:01Z INFO 47306 [SB_Allocator]: SB spills = 0 tensors +2025-08-07T13:53:01Z INFO 47306 [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:53:01Z INFO 47306 [SB_Allocator]: remats = 0 tensors +2025-08-07T13:53:01Z INFO 47306 [SB_Allocator]: unpinned = 0 tensors +2025-08-07T13:53:01Z INFO 47306 [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:53:01Z INFO 47306 [SB_Allocator]: SB score = 0 +2025-08-07T13:53:01Z INFO 47306 [SB_Allocator]: spilling from SB cost about 0 cycles +2025-08-07T13:53:01Z INFO 47306 [SB_Allocator]: 16384 bytes/partition (100%) successfully pinned +2025-08-07T13:53:01Z INFO 47306 [SB_Allocator]: pinning saved approximately 9010 cycles +2025-08-07T13:53:01Z INFO 47306 [SB_Allocator]: 0% SB utilization after allocation +2025-08-07T13:53:01Z INFO 47306 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 7583364294 +2025-08-07T13:53:01Z INFO 47306 [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 7312 bytes +2025-08-07T13:53:01Z INFO 47306 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 2812042 +2025-08-07T13:53:01Z INFO 47306 [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 397 bytes +2025-08-07T13:53:01Z INFO 47306 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 78980 +2025-08-07T13:53:01Z INFO 47306 [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 136 bytes +2025-08-07T13:53:01Z USER 47306 [ModuleForkPass]: coloring_allocator_sb finished after 25.641 seconds +2025-08-07T13:53:01Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1675mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:01Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27949 memory location(s), 1 block(s), and 277215 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:01Z USER 47306 [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:53:01Z INFO 47306 [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=27949 blocks=1 instructions=277215 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:02Z INFO 47306 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:53:02Z USER 47306 [ModuleForkPass]: address_rotation_sb finished after 0.360 seconds +2025-08-07T13:53:02Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1677mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:02Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27949 memory location(s), 1 block(s), and 277215 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:02Z USER 47306 [ModuleForkPass]: Running dma_optimization_sb +2025-08-07T13:53:02Z INFO 47306 [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=27949 blocks=1 instructions=277215 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:02Z INFO 47306 [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 7586176336, 99.9258% input load, 5.27275e-08% output write, 0.0742169% spill/reload [sg0000] +2025-08-07T13:53:02Z INFO 47306 [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:53:02Z INFO 47306 [DMAOptimizationBase]: removed 0 identical load +2025-08-07T13:53:02Z INFO 47306 [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:53:02Z INFO 47306 [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:53:02Z INFO 47306 [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-08-07T13:53:02Z INFO 47306 [DMAOptimizationBase]: [IO to internal DMACopy Insertion]: inserted 0 DMACopy instructions +2025-08-07T13:53:02Z INFO 47306 [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-08-07T13:53:02Z INFO 47306 [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-08-07T13:53:02Z INFO 47306 [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 168, 2.21455e-06% out of total dma traffic(7.58055e+09) +2025-08-07T13:53:02Z INFO 47306 [DMAOptimizationBase]: [spill optimization round 0]: removed 6 spill/reload instructions +2025-08-07T13:53:02Z INFO 47306 [DMAOptimizationBase]: [spill optimization round 0]: removed 6 spill/reload memory locations +2025-08-07T13:53:02Z INFO 47306 [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload instructions +2025-08-07T13:53:02Z INFO 47306 [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload memory locations +2025-08-07T13:53:02Z INFO 47306 [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 4100, 0.0728213% out of total spill/reload dma traffic +2025-08-07T13:53:03Z INFO 47306 [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-08-07T13:53:03Z INFO 47306 [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-08-07T13:53:03Z INFO 47306 [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:53:03Z INFO 47306 [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-08-07T13:53:03Z INFO 47306 [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-08-07T13:53:03Z INFO 47306 [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:53:03Z INFO 47306 [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-08-07T13:53:03Z INFO 47306 [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-08-07T13:53:03Z INFO 47306 [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-08-07T13:53:03Z INFO 47306 [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-08-07T13:53:03Z INFO 47306 [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 116 SpillSaves and Reloads +2025-08-07T13:53:03Z INFO 47306 [DMAOptimizationBase]: average loaded DMA size 7326 bytes +2025-08-07T13:53:03Z INFO 47306 [DMAOptimizationBase]: average saved DMA size 539 bytes +2025-08-07T13:53:03Z INFO 47306 [DMAOptimizationBase]: DMA SpillSave Coalescing Round 1 combined 56 SpillSaves and Reloads +2025-08-07T13:53:03Z INFO 47306 [DMAOptimizationBase]: average loaded DMA size 7332 bytes +2025-08-07T13:53:03Z INFO 47306 [DMAOptimizationBase]: average saved DMA size 650 bytes +2025-08-07T13:53:04Z INFO 47306 [DMAOptimizationBase]: DMA SpillSave Coalescing Round 2 combined 0 SpillSaves and Reloads +2025-08-07T13:53:04Z INFO 47306 [DMAOptimizationBase]: average loaded DMA size 7332 bytes +2025-08-07T13:53:04Z INFO 47306 [DMAOptimizationBase]: average saved DMA size 650 bytes +2025-08-07T13:53:04Z INFO 47306 [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 7583362076 +2025-08-07T13:53:04Z INFO 47306 [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 7332 bytes +2025-08-07T13:53:04Z INFO 47306 [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 2809992 +2025-08-07T13:53:04Z INFO 47306 [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 650 bytes +2025-08-07T13:53:04Z INFO 47306 [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:53:04Z INFO 47306 [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:53:04Z INFO 47306 [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 4268, 5.62602e-05% out of total dma traffic +2025-08-07T13:53:04Z INFO 47306 [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 7586172068, 99.9258% input load, 5.27275e-08% output write, 0.0741629% spill/reload [sg0000] +2025-08-07T13:53:04Z INFO 47306 [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 7583362076 +2025-08-07T13:53:04Z INFO 47306 [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 7332 bytes +2025-08-07T13:53:04Z INFO 47306 [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 2809992 +2025-08-07T13:53:04Z INFO 47306 [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 650 bytes +2025-08-07T13:53:04Z INFO 47306 [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 78980 +2025-08-07T13:53:04Z INFO 47306 [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 136 bytes +2025-08-07T13:53:04Z INFO 47306 [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 7300 bytes +2025-08-07T13:53:04Z INFO 47306 [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-08-07T13:53:04Z INFO 47306 [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-08-07T13:53:04Z USER 47306 [ModuleForkPass]: dma_optimization_sb finished after 2.243 seconds +2025-08-07T13:53:04Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1712mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:04Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277081 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:04Z USER 47306 [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:53:04Z INFO 47306 [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=27771 blocks=1 instructions=277081 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:04Z INFO 47306 [DMAOptimizationBase]: SB Rotation rotated 359 Sb address +2025-08-07T13:53:04Z INFO 47306 [DMAOptimizationBase]: SB Rotation rotated 4462 Sb address +2025-08-07T13:53:05Z INFO 47306 [DMAOptimizationBase]: SB Rotation rotated 856 Sb address +2025-08-07T13:53:05Z INFO 47306 [DMAOptimizationBase]: SB Rotation rotated 445 Sb address +2025-08-07T13:53:05Z INFO 47306 [DMAOptimizationBase]: SB Rotation rotated 1976 Sb address +2025-08-07T13:53:06Z INFO 47306 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:53:06Z USER 47306 [ModuleForkPass]: address_rotation_sb finished after 1.694 seconds +2025-08-07T13:53:06Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1712mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:06Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277081 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:06Z USER 47306 [ModuleForkPass]: Running coloring_allocator_dram +2025-08-07T13:53:06Z INFO 47306 [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=27771 blocks=1 instructions=277081 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:06Z INFO 47306 [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:53:06Z INFO 47306 [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:53:06Z INFO 47306 [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-08-07T13:53:06Z INFO 47306 [DRAM_Allocator]: reserved space = 8344433440 bytes +2025-08-07T13:53:06Z INFO 47306 [DRAM_Allocator]: spill space = 3420292 bytes +2025-08-07T13:53:06Z INFO 47306 [DRAM_Allocator]: aligned spill space = 3469312 bytes +2025-08-07T13:53:06Z INFO 47306 [DRAM_Allocator]: dram space = 107374182400 bytes +2025-08-07T13:53:06Z INFO 47306 [DRAM_Allocator]: renumber locations +2025-08-07T13:53:06Z INFO 47306 [DRAM_Allocator]: size = 178 +2025-08-07T13:53:06Z INFO 47306 []: find first defs for local +2025-08-07T13:53:06Z INFO 47306 []: find first defs for global +2025-08-07T13:53:06Z INFO 47306 [DRAM_Allocator]: Num intervals 178 Num locations 178 +2025-08-07T13:53:06Z INFO 47306 [DRAM_Allocator]: IntervalTree Build Done +2025-08-07T13:53:06Z INFO 47306 [DRAM_Allocator]: info.neighbors init Done +2025-08-07T13:53:06Z INFO 47306 [DRAM_Allocator]: IntervalTree readback Done +2025-08-07T13:53:06Z INFO 47306 [DRAM_Allocator]: simplify interference graph +2025-08-07T13:53:06Z INFO 47306 [DRAM_Allocator]: initialize low and high +2025-08-07T13:53:06Z INFO 47306 [DRAM_Allocator]: lo = 178 +2025-08-07T13:53:06Z INFO 47306 [DRAM_Allocator]: hi = 0 +2025-08-07T13:53:06Z INFO 47306 [DRAM_Allocator]: total = 178 +2025-08-07T13:53:06Z INFO 47306 [DRAM_Allocator]: simplify +2025-08-07T13:53:06Z INFO 47306 [DRAM_Allocator]: new candidates = 0 +2025-08-07T13:53:06Z INFO 47306 [DRAM_Allocator]: select ranges +2025-08-07T13:53:06Z INFO 47306 [DRAM_Allocator]: CC buffer size limit 524288000 +2025-08-07T13:53:06Z INFO 47306 [DRAM_Allocator]: allreduce_dram_hwm 1208320 +2025-08-07T13:53:06Z INFO 47306 [DRAM_Allocator]: Real CC buffer size 1208320 +2025-08-07T13:53:06Z INFO 47306 [DRAM_Allocator]: DRAM hwm after allocation: 3117056 +2025-08-07T13:53:06Z INFO 47306 [DRAM_Allocator]: DRAM allocation successful +2025-08-07T13:53:06Z USER 47306 [ModuleForkPass]: coloring_allocator_dram finished after 0.381 seconds +2025-08-07T13:53:06Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1715mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:06Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277081 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:06Z USER 47306 [ModuleForkPass]: Running address_rotation_dram +2025-08-07T13:53:06Z INFO 47306 [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=27771 blocks=1 instructions=277081 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:06Z INFO 47306 [DMAOptimizationBase]: Runtime page size at 512MB +2025-08-07T13:53:06Z INFO 47306 [DMAOptimizationBase]: DRAM hwm before rotation 3117056 +2025-08-07T13:53:06Z INFO 47306 [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-08-07T13:53:06Z INFO 47306 [DMAOptimizationBase]: allreduce hwm 1208320 +2025-08-07T13:53:06Z INFO 47306 [DMAOptimizationBase]: Real CC buffer size 1208320 +2025-08-07T13:53:06Z INFO 47306 [DMAOptimizationBase]: DRAM hwm after rotation 3117056 +2025-08-07T13:53:06Z INFO 47306 [DMAOptimizationBase]: DRAM Rotation rotated 9 Dram address +2025-08-07T13:53:06Z USER 47306 [ModuleForkPass]: address_rotation_dram finished after 0.186 seconds +2025-08-07T13:53:06Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1717mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:06Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277081 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:06Z USER 47306 [ModuleForkPass]: Running tensorcopy_accel +2025-08-07T13:53:06Z INFO 47306 [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=27771 blocks=1 instructions=277081 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:06Z INFO 47306 [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-08-07T13:53:06Z INFO 47306 [TensorCopyAccel::Impl]: Accelerated 72 out of 11862 tensorcopy in Function: sg0000 average acceleration factor: 1 +2025-08-07T13:53:06Z USER 47306 [ModuleForkPass]: tensorcopy_accel finished after 0.024 seconds +2025-08-07T13:53:06Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1717mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:06Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277081 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:06Z USER 47306 [ModuleForkPass]: Running peephole_opts +2025-08-07T13:53:06Z INFO 47306 [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=27771 blocks=1 instructions=277081 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:06Z INFO 47306 [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-08-07T13:53:06Z USER 47306 [ModuleForkPass]: peephole_opts finished after 0.100 seconds +2025-08-07T13:53:06Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1717mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:06Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277119 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:06Z USER 47306 [ModuleForkPass]: Running lower_kernel +2025-08-07T13:53:06Z INFO 47306 [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=27771 blocks=1 instructions=277119 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:06Z INFO 47306 [LowerKernel]: Started running LowerKernel +2025-08-07T13:53:06Z INFO 47306 [LowerKernel]: Start of kernel lowering pass, number of insts: 277119, number of allocs: 27771 +2025-08-07T13:53:06Z INFO 47306 [LowerKernel]: Scan BKs time (s): 0.019819 +2025-08-07T13:53:06Z INFO 47306 [LowerKernel]: Lower BKs time (s): 6e-06 +2025-08-07T13:53:06Z USER 47306 [ModuleForkPass]: lower_kernel finished after 0.022 seconds +2025-08-07T13:53:06Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1717mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:06Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277119 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:06Z USER 47306 [ModuleForkPass]: Running lower_nki_kernel +2025-08-07T13:53:06Z INFO 47306 [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=27771 blocks=1 instructions=277119 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:06Z USER 47306 [ModuleForkPass]: lower_nki_kernel finished after 0.021 seconds +2025-08-07T13:53:06Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1717mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:06Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277119 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:06Z USER 47306 [ModuleForkPass]: Running dynamic_dma_cleanup +2025-08-07T13:53:06Z INFO 47306 [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=27771 blocks=1 instructions=277119 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:06Z USER 47306 [ModuleForkPass]: dynamic_dma_cleanup finished after 0.032 seconds +2025-08-07T13:53:06Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1719mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:06Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277119 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:06Z USER 47306 [ModuleForkPass]: Running birverifier +2025-08-07T13:53:06Z INFO 47306 [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=27771 blocks=1 instructions=277119 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:07Z USER 47306 [ModuleForkPass]: birverifier finished after 0.203 seconds +2025-08-07T13:53:07Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1719mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:07Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277119 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:07Z USER 47306 [ModuleForkPass]: Running dynamic_dma_scan +2025-08-07T13:53:07Z INFO 47306 [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=27771 blocks=1 instructions=277119 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:07Z USER 47306 [ModuleForkPass]: dynamic_dma_scan finished after 0.032 seconds +2025-08-07T13:53:07Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1719mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:07Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277119 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:07Z USER 47306 [ModuleForkPass]: Running build_fdeps +2025-08-07T13:53:07Z INFO 47306 [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=27771 blocks=1 instructions=277119 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:07Z INFO 47306 [build_flow_deps]: Start build fdeps. Invocation: 2Thu Aug 7 13:53:07 2025 +2025-08-07T13:53:07Z INFO 47306 [build_flow_deps]: Allocs: 27771 instructions: 277119 +2025-08-07T13:53:07Z INFO 47306 [build_flow_deps]: Build fdeps inserted 816543 edges +2025-08-07T13:53:07Z INFO 47306 [build_flow_deps]: Done build fdeps 816543 Thu Aug 7 13:53:07 2025 +2025-08-07T13:53:07Z USER 47306 [ModuleForkPass]: build_fdeps finished after 0.601 seconds +2025-08-07T13:53:07Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1731mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:07Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277119 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:07Z USER 47306 [ModuleForkPass]: Running remove_redundancies +2025-08-07T13:53:07Z INFO 47306 [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=27771 blocks=1 instructions=277119 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:07Z INFO 47306 [RemoveRedundancies]: remove_clobbered_writes +2025-08-07T13:53:07Z INFO 47306 [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-08-07T13:53:07Z INFO 47306 [RemoveRedundancies]: remove_useless_insts +2025-08-07T13:53:07Z INFO 47306 [RemoveRedundancies]: remove Useless Instructions: 0 +2025-08-07T13:53:07Z USER 47306 [ModuleForkPass]: remove_redundancies finished after 0.092 seconds +2025-08-07T13:53:07Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1731mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:07Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277119 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:07Z USER 47306 [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:53:07Z INFO 47306 [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=27771 blocks=1 instructions=277119 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:07Z INFO 47306 [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:53:07Z INFO 47306 [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:53:07Z INFO 47306 [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:53:09Z USER 47306 [ModuleForkPass]: anti_dependency_analyzer finished after 1.184 seconds +2025-08-07T13:53:09Z INFO 47306 [ModuleForkPass]: curr_vmrss: 2167mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:09Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277119 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:09Z USER 47306 [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:53:09Z INFO 47306 [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=27771 blocks=1 instructions=277119 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:09Z INFO 47306 [TensorCopyElim]: Tensor CP elimination: 0 +2025-08-07T13:53:09Z INFO 47306 [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:53:09Z USER 47306 [ModuleForkPass]: tensor_copy_elim finished after 0.266 seconds +2025-08-07T13:53:09Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1829mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:09Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277119 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:09Z USER 47306 [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-08-07T13:53:09Z INFO 47306 [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=27771 blocks=1 instructions=277119 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:09Z USER 47306 [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.001 seconds +2025-08-07T13:53:09Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1829mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:09Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277119 instruction(s). Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:09Z USER 47306 [ModuleForkPass]: Running post_sched +2025-08-07T13:53:09Z INFO 47306 [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=27771 blocks=1 instructions=277119 Max writers: 1536 Max Readers: 20035 +2025-08-07T13:53:09Z INFO 47306 [post_scheduler]: Start PosT ScheD 3 sunda Thu Aug 7 13:53:09 2025 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.395-t35739 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.727-t35769 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.1059-t35799 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.1391-t35829 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.1723-t35859 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.2055-t35889 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.2387-t35919 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.2719-t35949 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.3051-t35979 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.3383-t36009 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.3715-t36039 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.4047-t36069 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.4379-t36099 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.4711-t36129 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.5043-t36159 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.5375-t36189 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.5707-t36219 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.6039-t36249 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.6371-t36279 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.6703-t36309 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.7035-t36339 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.7367-t36369 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.7699-t36399 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.8031-t36429 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.8363-t36459 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.8695-t36489 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.9027-t36519 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.9359-t36549 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.9691-t36579 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.10023-t36609 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.10355-t36639 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.10687-t36669 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.11019-t36699 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.11351-t36729 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.11683-t36759 +2025-08-07T13:53:09Z WARNING 47306 [post_scheduler]: Inserted memset 0 for _dot.12015-t36789 +2025-08-07T13:53:21Z INFO 47306 [post_scheduler]: Time-aware hwm post-sched +2025-08-07T13:53:27Z INFO 47306 [post_scheduler]: Time-aware simulation time: 34821961 +2025-08-07T13:53:28Z INFO 47306 [post_scheduler]: Done PosT ScheD Thu Aug 7 13:53:28 2025 +2025-08-07T13:53:28Z USER 47306 [ModuleForkPass]: post_sched finished after 19.659 seconds +2025-08-07T13:53:28Z INFO 47306 [ModuleForkPass]: curr_vmrss: 2327mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:29Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277155 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:29Z USER 47306 [ModuleForkPass]: Running expand_scheduling_units +2025-08-07T13:53:29Z INFO 47306 [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=27771 blocks=1 instructions=277155 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:29Z USER 47306 [ModuleForkPass]: expand_scheduling_units finished after 0.029 seconds +2025-08-07T13:53:29Z INFO 47306 [ModuleForkPass]: curr_vmrss: 2211mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:29Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277155 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:29Z USER 47306 [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:53:29Z INFO 47306 [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=27771 blocks=1 instructions=277155 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:32Z INFO 47306 [DMAOptimizationBase]: PSUM Rotation rotated 6277 PSUM Banks +2025-08-07T13:53:32Z INFO 47306 [DMAOptimizationBase]: PSUM Rotation rotated 7380 PSUM Banks +2025-08-07T13:53:33Z INFO 47306 [DMAOptimizationBase]: PSUM Rotation rotated 305 PSUM Banks +2025-08-07T13:53:33Z INFO 47306 [DMAOptimizationBase]: SB Rotation rotated 420 Sb address +2025-08-07T13:53:34Z INFO 47306 [DMAOptimizationBase]: SB Rotation rotated 4686 Sb address +2025-08-07T13:53:34Z INFO 47306 [DMAOptimizationBase]: SB Rotation rotated 389 Sb address +2025-08-07T13:53:34Z INFO 47306 [DMAOptimizationBase]: SB Rotation rotated 210 Sb address +2025-08-07T13:53:34Z INFO 47306 [DMAOptimizationBase]: SB Rotation rotated 183 Sb address +2025-08-07T13:53:35Z INFO 47306 [DMAOptimizationBase]: moved 0 MM forward +2025-08-07T13:53:35Z INFO 47306 [DMAOptimizationBase]: SB Rotation rotated 7 Sb address +2025-08-07T13:53:35Z INFO 47306 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:53:35Z USER 47306 [ModuleForkPass]: address_rotation_sb finished after 6.667 seconds +2025-08-07T13:53:35Z INFO 47306 [ModuleForkPass]: curr_vmrss: 2230mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:35Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277155 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:35Z USER 47306 [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:53:35Z INFO 47306 [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=27771 blocks=1 instructions=277155 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:35Z INFO 47306 [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:53:35Z INFO 47306 [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:53:35Z INFO 47306 [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:53:36Z USER 47306 [ModuleForkPass]: anti_dependency_analyzer finished after 1.115 seconds +2025-08-07T13:53:36Z INFO 47306 [ModuleForkPass]: curr_vmrss: 2340mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:36Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277155 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:36Z USER 47306 [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:53:36Z INFO 47306 [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=27771 blocks=1 instructions=277155 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:36Z INFO 47306 [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:53:36Z INFO 47306 [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-08-07T13:53:36Z INFO 47306 [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:53:37Z USER 47306 [ModuleForkPass]: anti_dependency_analyzer finished after 0.170 seconds +2025-08-07T13:53:37Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1979mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:37Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277155 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:37Z USER 47306 [ModuleForkPass]: Running dep_opt +2025-08-07T13:53:37Z INFO 47306 [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=27771 blocks=1 instructions=277155 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:37Z INFO 47306 [build_flow_deps]: Start build fdeps. Invocation: 3Thu Aug 7 13:53:37 2025 +2025-08-07T13:53:37Z INFO 47306 [build_flow_deps]: Allocs: 27771 instructions: 277155 +2025-08-07T13:53:37Z INFO 47306 [build_flow_deps]: Build fdeps inserted 806528 edges +2025-08-07T13:53:37Z INFO 47306 [build_flow_deps]: Done build fdeps 806528 Thu Aug 7 13:53:37 2025 +2025-08-07T13:53:38Z USER 47306 [ModuleForkPass]: dep_opt finished after 1.062 seconds +2025-08-07T13:53:38Z INFO 47306 [ModuleForkPass]: curr_vmrss: 2007mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:38Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277155 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:38Z USER 47306 [ModuleForkPass]: Running report_stats +2025-08-07T13:53:38Z INFO 47306 [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=27771 blocks=1 instructions=277155 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:38Z INFO 47306 [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 1 │ 622329856 │ +│ DMACopy │ Internal │ 1 │ 24576 │ +│ DMACopy │ Internal -> ExternalOutput │ 72 │ 75497472 │ +│ Load │ Const -> Internal │ 77 │ 2394376 │ +│ Load │ ExternalInput -> Internal │ 8047 │ 7578151564 │ +│ Load │ Internal │ 107 │ 2816136 │ +│ Save │ Internal │ 695 │ 2809988 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 4 │ +└─────────────┴────────────────────────────┴───────┴────────────┘ + +2025-08-07T13:53:38Z INFO 47306 [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 1 │ 2 │ +│ 2 │ 72 │ +│ 4 │ 45 │ +│ 8 │ 2 │ +│ 16 │ 3 │ +│ 64 │ 73 │ +│ 256 │ 147 │ +│ 512 │ 665 │ +│ 1024 │ 88 │ +│ 2048 │ 30 │ +│ 4096 │ 2 │ +│ 6144 │ 2304 │ +│ 8192 │ 5493 │ +│ 60768 │ 1 │ +│ 60776 │ 4 │ +│ 262144 │ 72 │ +└─────────────────────┴───────┘ + +2025-08-07T13:53:38Z INFO 47306 [ReportStats]: MM Stats: #MatMults 251660 #MatMult-Transposes 20035 +2025-08-07T13:53:38Z INFO 47306 [ReportStats]: IO Tensor size combined: 8342039064 +2025-08-07T13:53:38Z INFO 47306 [ReportStats]: IO Tensor Statistics: +┌────────────────────┬───────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼───────────────┼──────────┼──────────────┤ +│ input473 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input76 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input85 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input106 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input96 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input84 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input98 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input109 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input107 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input95 │ ExternalInput │ bfloat16 │ 50331648 │ +└────────────────────┴───────────────┴──────────┴──────────────┘ + +2025-08-07T13:53:38Z INFO 47306 [ReportStats]: Large (Internal) Tensor Statistics: +┌────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────────┼──────────┼──────────┼──────────────┤ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ input83_local_32950_i3 │ Internal │ bfloat16 │ 1048576 │ +│ -t62398 │ Internal │ float32 │ 1048576 │ +│ input83_local_32950_i1 │ Internal │ bfloat16 │ 1048576 │ +│ -t62392 │ Internal │ float32 │ 1048576 │ +│ input83_local_32950_i2 │ Internal │ bfloat16 │ 1048576 │ +│ -t62387 │ Internal │ float32 │ 1048576 │ +│ input83_local_32950_i5 │ Internal │ bfloat16 │ 1048576 │ +│ input83_local_32950_i4 │ Internal │ bfloat16 │ 1048576 │ +│ input83_local_32950_i0 │ Internal │ bfloat16 │ 1048576 │ +└────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-08-07T13:53:38Z USER 47306 [ModuleForkPass]: report_stats finished after 0.060 seconds +2025-08-07T13:53:38Z INFO 47306 [ModuleForkPass]: curr_vmrss: 2007mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:38Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277155 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:38Z USER 47306 [BackendPassManager]: mod_parallel_pass finished after 72.255 seconds +2025-08-07T13:53:38Z INFO 47306 [BackendPassManager]: curr_vmrss: 2007mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:38Z INFO 47306 [BackendPassManager]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277155 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:38Z USER 47306 [BackendPassManager]: Running assign_trigger_engine +2025-08-07T13:53:38Z INFO 47306 [BackendPassManager]: Inputs to assign_trigger_engine: modules=1 functions=1 allocs=27771 blocks=1 instructions=277155 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:38Z INFO 47306 [AssignTriggerEngine]: Assigned trigger engine for 771 DMA instructions. Moved 76 DMA instructions to CC's engines. +2025-08-07T13:53:38Z USER 47306 [BackendPassManager]: assign_trigger_engine finished after 0.098 seconds +2025-08-07T13:53:38Z INFO 47306 [BackendPassManager]: curr_vmrss: 2008mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:38Z INFO 47306 [BackendPassManager]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277155 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:38Z USER 47306 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:53:38Z INFO 47306 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=1 allocs=27771 blocks=1 instructions=277155 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:38Z USER 47306 [SubgraphForkPass]: Running lower_local_collectives +2025-08-07T13:53:38Z INFO 47306 [SubgraphForkPass]: Inputs to lower_local_collectives: modules=1 functions=1 allocs=27771 blocks=1 instructions=277155 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:38Z USER 47306 [SubgraphForkPass]: lower_local_collectives finished after 0.001 seconds +2025-08-07T13:53:38Z INFO 47306 [SubgraphForkPass]: curr_vmrss: 2008mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:38Z INFO 47306 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277155 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:38Z USER 47306 [SubgraphForkPass]: Running extend_shared_lifetimes +2025-08-07T13:53:38Z INFO 47306 [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=1 functions=1 allocs=27771 blocks=1 instructions=277155 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:38Z USER 47306 [SubgraphForkPass]: extend_shared_lifetimes finished after 0.001 seconds +2025-08-07T13:53:38Z INFO 47306 [SubgraphForkPass]: curr_vmrss: 2008mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:38Z INFO 47306 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277155 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:38Z USER 47306 [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:53:38Z INFO 47306 [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=27771 blocks=1 instructions=277155 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:38Z INFO 47306 [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:53:38Z USER 47306 [SubgraphForkPass]: dead_code_elim finished after 0.169 seconds +2025-08-07T13:53:38Z INFO 47306 [SubgraphForkPass]: curr_vmrss: 2009mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:38Z INFO 47306 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277155 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:38Z USER 47306 [BackendPassManager]: subgraph_parallel_pass finished after 0.177 seconds +2025-08-07T13:53:38Z INFO 47306 [BackendPassManager]: curr_vmrss: 2009mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:38Z INFO 47306 [BackendPassManager]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277155 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:38Z USER 47306 [BackendPassManager]: Running assign_hwdge_engine +2025-08-07T13:53:38Z INFO 47306 [BackendPassManager]: Inputs to assign_hwdge_engine: modules=1 functions=1 allocs=27771 blocks=1 instructions=277155 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:38Z USER 47306 [BackendPassManager]: assign_hwdge_engine finished after 0.029 seconds +2025-08-07T13:53:38Z INFO 47306 [BackendPassManager]: curr_vmrss: 2009mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:38Z INFO 47306 [BackendPassManager]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277155 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:38Z USER 47306 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:53:38Z INFO 47306 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=27771 blocks=1 instructions=277155 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:38Z USER 47306 [ModuleForkPass]: Running alloc_queues +2025-08-07T13:53:38Z INFO 47306 [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=27771 blocks=1 instructions=277155 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:38Z INFO 47306 [AllocQueues]: DMACopy transpose will be triggered from multiple engines +2025-08-07T13:53:38Z INFO 47306 [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 35 │ +│ qPoolIO0 │ input │ Pool │ 16 │ 1 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 109 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 95 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 671 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 5 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 8085 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-08-07T13:53:38Z USER 47306 [ModuleForkPass]: alloc_queues finished after 0.030 seconds +2025-08-07T13:53:38Z INFO 47306 [ModuleForkPass]: curr_vmrss: 2009mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:38Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277155 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:38Z USER 47306 [ModuleForkPass]: Running chain_dma_transposes +2025-08-07T13:53:38Z INFO 47306 [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=27771 blocks=1 instructions=277155 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:38Z USER 47306 [ModuleForkPass]: chain_dma_transposes finished after 0.001 seconds +2025-08-07T13:53:38Z INFO 47306 [ModuleForkPass]: curr_vmrss: 2009mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:38Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277155 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:38Z USER 47306 [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-08-07T13:53:38Z INFO 47306 [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=27771 blocks=1 instructions=277155 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:38Z USER 47306 [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.001 seconds +2025-08-07T13:53:38Z INFO 47306 [ModuleForkPass]: curr_vmrss: 2009mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:38Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277155 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:38Z USER 47306 [ModuleForkPass]: Running lower_control +2025-08-07T13:53:38Z INFO 47306 [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=27771 blocks=1 instructions=277155 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:38Z INFO 47306 [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-08-07T13:53:38Z USER 47306 [ModuleForkPass]: lower_control finished after 0.337 seconds +2025-08-07T13:53:38Z INFO 47306 [ModuleForkPass]: curr_vmrss: 2009mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:38Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277155 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:38Z USER 47306 [BackendPassManager]: mod_parallel_pass finished after 0.377 seconds +2025-08-07T13:53:38Z INFO 47306 [BackendPassManager]: curr_vmrss: 2009mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:38Z INFO 47306 [BackendPassManager]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277155 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:38Z USER 47306 [BackendPassManager]: Running nc_parallel_pass +2025-08-07T13:53:38Z INFO 47306 [BackendPassManager]: Inputs to nc_parallel_pass: modules=1 functions=1 allocs=27771 blocks=1 instructions=277155 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:38Z USER 47306 [CoreForkPass]: Running dep_reduction +2025-08-07T13:53:38Z INFO 47306 [CoreForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=27771 blocks=1 instructions=277155 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:38Z INFO 47306 [DepReduction]: Start Dependency Reduction +2025-08-07T13:53:39Z INFO 47306 [DepReduction]: Processing async instrs... +2025-08-07T13:53:39Z INFO 47306 [DepReduction]: Processing secondary edges per engine... +2025-08-07T13:53:39Z INFO 47306 [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 253054 +2025-08-07T13:53:39Z INFO 47306 [DepReduction]: Processing redundant descendants, Done. Num edges removed 262102 +2025-08-07T13:53:39Z INFO 47306 [DepReduction]: Processing async instrs, Done. Num edges removed 262102 +2025-08-07T13:53:42Z INFO 47306 [DepReduction]: Num Async removed: 0 +2025-08-07T13:53:42Z INFO 47306 [DepReduction]: Finished dependency reduction: 1848725 removed, new total 38899 +2025-08-07T13:53:42Z INFO 47306 [DepReduction]: Finished Dependency Reduction +2025-08-07T13:53:42Z USER 47306 [CoreForkPass]: dep_reduction finished after 3.228 seconds +2025-08-07T13:53:42Z INFO 47306 [CoreForkPass]: curr_vmrss: 2254mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:42Z INFO 47306 [CoreForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277155 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:42Z USER 47306 [CoreForkPass]: Running lower_dynamic_dma +2025-08-07T13:53:42Z INFO 47306 [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=1 allocs=27771 blocks=1 instructions=277155 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:42Z USER 47306 [CoreForkPass]: lower_dynamic_dma finished after 0.044 seconds +2025-08-07T13:53:42Z INFO 47306 [CoreForkPass]: curr_vmrss: 2228mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:42Z INFO 47306 [CoreForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277155 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:42Z USER 47306 [CoreForkPass]: Running legalize_dynamic_dma +2025-08-07T13:53:42Z INFO 47306 [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=1 allocs=27771 blocks=1 instructions=277155 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:42Z INFO 47306 [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 1 DGE instructions +2025-08-07T13:53:42Z INFO 47306 [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 1 DGE instructions were scanned +2025-08-07T13:53:42Z INFO 47306 [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-08-07T13:53:42Z USER 47306 [CoreForkPass]: legalize_dynamic_dma finished after 0.108 seconds +2025-08-07T13:53:42Z INFO 47306 [CoreForkPass]: curr_vmrss: 2228mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:42Z INFO 47306 [CoreForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277155 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:42Z USER 47306 [CoreForkPass]: Running lower_dma +2025-08-07T13:53:42Z INFO 47306 [CoreForkPass]: Inputs to lower_dma: modules=1 functions=1 allocs=27771 blocks=1 instructions=277155 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:42Z INFO 47306 [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 7939/7939 (100% DGE) + power-of-2 partition : 7940/7976 (99.5486% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 7940/7976 (99.5486% DGE) + Cast (DGE/DMA) + 128 partition : 72/72 (100% DGE) + power-of-2 partition : 72/72 (100% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 72/72 (100% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 0/8 (0% DGE) + power-of-2 partition : 0/879 (0% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/879 (0% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/0 + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/0 + CopyMode + CCE : 1 + Transpose : 0 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 1/1 (100% DGE) + vector : 72/72 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-08-07T13:53:42Z USER 47306 [CoreForkPass]: lower_dma finished after 0.140 seconds +2025-08-07T13:53:42Z INFO 47306 [CoreForkPass]: curr_vmrss: 2228mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:42Z INFO 47306 [CoreForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277157 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:42Z USER 47306 [CoreForkPass]: Running coalesce_dma_blocks +2025-08-07T13:53:42Z INFO 47306 [CoreForkPass]: Inputs to coalesce_dma_blocks: modules=1 functions=1 allocs=27771 blocks=1 instructions=277157 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:42Z INFO 47306 [CoalesceDmaBlocks]: Coaleseced 50 DMA triggers +2025-08-07T13:53:42Z USER 47306 [CoreForkPass]: coalesce_dma_blocks finished after 0.111 seconds +2025-08-07T13:53:42Z INFO 47306 [CoreForkPass]: curr_vmrss: 2232mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:42Z INFO 47306 [CoreForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277107 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:42Z USER 47306 [CoreForkPass]: Running expand_all_engine +2025-08-07T13:53:42Z INFO 47306 [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=1 allocs=27771 blocks=1 instructions=277107 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:42Z USER 47306 [CoreForkPass]: expand_all_engine finished after 0.041 seconds +2025-08-07T13:53:42Z INFO 47306 [CoreForkPass]: curr_vmrss: 2227mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:42Z INFO 47306 [CoreForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277107 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:42Z USER 47306 [CoreForkPass]: Running alloc_semaphores +2025-08-07T13:53:42Z INFO 47306 [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=1 allocs=27771 blocks=1 instructions=277107 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:42Z USER 47306 [CoreForkPass]: alloc_semaphores finished after 0.332 seconds +2025-08-07T13:53:42Z INFO 47306 [CoreForkPass]: curr_vmrss: 2227mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:42Z INFO 47306 [CoreForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277107 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:42Z USER 47306 [CoreForkPass]: Running expand_inst_late +2025-08-07T13:53:42Z INFO 47306 [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=1 allocs=27771 blocks=1 instructions=277107 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:43Z USER 47306 [CoreForkPass]: expand_inst_late finished after 0.395 seconds +2025-08-07T13:53:43Z INFO 47306 [CoreForkPass]: curr_vmrss: 2227mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:43Z INFO 47306 [CoreForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277182 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:43Z USER 47306 [CoreForkPass]: Running seq_inst_opt +2025-08-07T13:53:43Z INFO 47306 [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=1 allocs=27771 blocks=1 instructions=277182 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:43Z INFO 47306 [SeqInstOpt]: Removing 71 unnecessary InstRegisterMove instruction(s) from Block1 +2025-08-07T13:53:43Z USER 47306 [CoreForkPass]: seq_inst_opt finished after 0.031 seconds +2025-08-07T13:53:43Z INFO 47306 [CoreForkPass]: curr_vmrss: 2227mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:43Z INFO 47306 [CoreForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 277111 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:43Z USER 47306 [CoreForkPass]: Running lower_sync +2025-08-07T13:53:43Z INFO 47306 [CoreForkPass]: Inputs to lower_sync: modules=1 functions=1 allocs=27771 blocks=1 instructions=277111 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:43Z USER 47306 [CoreForkPass]: lower_sync finished after 0.086 seconds +2025-08-07T13:53:43Z INFO 47306 [CoreForkPass]: curr_vmrss: 2234mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:43Z INFO 47306 [CoreForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 285858 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:43Z USER 47306 [CoreForkPass]: Running lower_act +2025-08-07T13:53:43Z INFO 47306 [CoreForkPass]: Inputs to lower_act: modules=1 functions=1 allocs=27771 blocks=1 instructions=285858 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:43Z USER 47306 [CoreForkPass]: lower_act finished after 0.030 seconds +2025-08-07T13:53:43Z INFO 47306 [CoreForkPass]: curr_vmrss: 2234mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:43Z INFO 47306 [CoreForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 286091 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:43Z USER 47306 [CoreForkPass]: Running lower_dve +2025-08-07T13:53:43Z INFO 47306 [CoreForkPass]: Inputs to lower_dve: modules=1 functions=1 allocs=27771 blocks=1 instructions=286091 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:43Z INFO 47306 [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen2/dve_info.json +2025-08-07T13:53:43Z USER 47306 [CoreForkPass]: lower_dve finished after 0.332 seconds +2025-08-07T13:53:43Z INFO 47306 [CoreForkPass]: curr_vmrss: 2278mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:43Z INFO 47306 [CoreForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 286091 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:43Z USER 47306 [CoreForkPass]: Running lower_ap +2025-08-07T13:53:43Z INFO 47306 [CoreForkPass]: Inputs to lower_ap: modules=1 functions=1 allocs=27771 blocks=1 instructions=286091 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:43Z USER 47306 [CoreForkPass]: lower_ap finished after 0.046 seconds +2025-08-07T13:53:43Z INFO 47306 [CoreForkPass]: curr_vmrss: 2236mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:43Z INFO 47306 [CoreForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 286091 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:43Z USER 47306 [CoreForkPass]: Running coloring_allocator_reg +2025-08-07T13:53:43Z INFO 47306 [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=1 allocs=27771 blocks=1 instructions=286091 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:43Z INFO 47306 [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:53:43Z INFO 47306 [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:53:43Z INFO 47306 [REG_Allocator]: allocating REG +2025-08-07T13:53:43Z INFO 47306 [REG_Allocator]: main loop iteration 1 +2025-08-07T13:53:43Z INFO 47306 [REG_Allocator]: renumber registers +2025-08-07T13:53:43Z INFO 47306 [REG_Allocator]: size = 5 +2025-08-07T13:53:43Z INFO 47306 []: find first defs for local reg +2025-08-07T13:53:43Z INFO 47306 []: find first defs for global reg +2025-08-07T13:53:44Z INFO 47306 [REG_Allocator]: live range analysis +2025-08-07T13:53:44Z INFO 47306 [REG_Allocator]: find costs +2025-08-07T13:53:44Z INFO 47306 [REG_Allocator]: simplify interference graph +2025-08-07T13:53:44Z INFO 47306 [REG_Allocator]: initialize low and high +2025-08-07T13:53:44Z INFO 47306 [REG_Allocator]: lo = 5 +2025-08-07T13:53:44Z INFO 47306 [REG_Allocator]: hi = 0 +2025-08-07T13:53:44Z INFO 47306 [REG_Allocator]: inf = 0 +2025-08-07T13:53:44Z INFO 47306 [REG_Allocator]: total = 5 +2025-08-07T13:53:44Z INFO 47306 [REG_Allocator]: simplify +2025-08-07T13:53:44Z INFO 47306 [REG_Allocator]: new candidates = 0 +2025-08-07T13:53:44Z INFO 47306 [REG_Allocator]: select ranges +2025-08-07T13:53:44Z INFO 47306 [REG_Allocator]: no more spills +2025-08-07T13:53:44Z INFO 47306 [REG_Allocator]: REG score = 0 (lower is better) +2025-08-07T13:53:44Z INFO 47306 [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-08-07T13:53:44Z INFO 47306 [REG_Allocator]: 0% REG utilization after allocation +2025-08-07T13:53:44Z USER 47306 [CoreForkPass]: coloring_allocator_reg finished after 0.389 seconds +2025-08-07T13:53:44Z INFO 47306 [CoreForkPass]: curr_vmrss: 2281mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:44Z INFO 47306 [CoreForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 286091 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:44Z USER 47306 [BackendPassManager]: nc_parallel_pass finished after 5.655 seconds +2025-08-07T13:53:44Z INFO 47306 [BackendPassManager]: curr_vmrss: 2236mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:44Z INFO 47306 [BackendPassManager]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 286091 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:44Z USER 47306 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:53:44Z INFO 47306 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=27771 blocks=1 instructions=286091 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:44Z USER 47306 [ModuleForkPass]: Running birverifier +2025-08-07T13:53:44Z INFO 47306 [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=27771 blocks=1 instructions=286091 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:44Z USER 47306 [ModuleForkPass]: birverifier finished after 0.226 seconds +2025-08-07T13:53:44Z INFO 47306 [ModuleForkPass]: curr_vmrss: 1983mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:44Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 286091 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:44Z USER 47306 [BackendPassManager]: mod_parallel_pass finished after 0.232 seconds +2025-08-07T13:53:44Z INFO 47306 [BackendPassManager]: curr_vmrss: 1983mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:44Z INFO 47306 [BackendPassManager]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 286091 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:44Z USER 47306 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:53:44Z INFO 47306 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=1 allocs=27771 blocks=1 instructions=286091 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:44Z USER 47306 [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:53:44Z INFO 47306 [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=27771 blocks=1 instructions=286091 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:44Z USER 47306 [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-08-07T13:53:44Z INFO 47306 [SubgraphForkPass]: curr_vmrss: 1983mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:44Z INFO 47306 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 286091 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:44Z USER 47306 [BackendPassManager]: subgraph_parallel_pass finished after 0.004 seconds +2025-08-07T13:53:44Z INFO 47306 [BackendPassManager]: curr_vmrss: 1983mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:44Z INFO 47306 [BackendPassManager]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 286091 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:44Z USER 47306 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:53:44Z INFO 47306 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=27771 blocks=1 instructions=286091 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:44Z USER 47306 [ModuleForkPass]: Running codegen +2025-08-07T13:53:44Z INFO 47306 [ModuleForkPass]: Inputs to codegen: modules=1 functions=1 allocs=27771 blocks=1 instructions=286091 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:44Z INFO 47306 [Codegen]: Total compiler allocated DRAM tensors: 0.00290298 GB +2025-08-07T13:53:44Z INFO 47306 [Codegen]: Total un-allocated DRAM tensors by kind: +2025-08-07T13:53:44Z INFO 47306 [Codegen]: +┌────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼─────────────┤ +│ ExternalInput │ 7.69882 │ +│ ExternalOutput │ 3.72529e-09 │ +│ Const │ 0.00222994 │ +└────────────────┴─────────────┘ + +2025-08-07T13:53:44Z INFO 47306 [Codegen]: Total runtime managed DRAM tensors: 7.70105 GB +2025-08-07T13:53:45Z INFO 47306 [Codegen]: Instruction Stats: +2025-08-07T13:53:45Z INFO 47306 [Codegen]: +┌─────────────────────┬────────┐ +│ Opcode │ Count │ +├─────────────────────┼────────┤ +│ MATMUL │ 251660 │ +│ LDWEIGHTS │ 251660 │ +│ ACTIVATE │ 12647 │ +│ EVENT_SEMAPHORE │ 8747 │ +│ UNKNOWN(0xd4) │ 8085 │ +│ TENSOR_TENSOR │ 1125 │ +│ PSEUDO_DMA_TRIGGER │ 866 │ +│ MATCH_VALUE_LOAD │ 441 │ +│ TENSOR_SCALAR_ADDR │ 345 │ +│ MEMSET │ 333 │ +│ TENSOR_SCALAR │ 332 │ +│ LOAD_MASK_SELECT │ 294 │ +│ ACT_TABLE_LOAD │ 233 │ +│ CAST │ 230 │ +│ MAX8 │ 224 │ +│ FIND_INDEX8 │ 224 │ +│ STREAM_SHUFFLE │ 222 │ +│ MATCH_REPLACE8 │ 217 │ +│ TENSOR_REDUCE │ 151 │ +│ UNKNOWN(0xda) │ 148 │ +│ GATHER │ 99 │ +│ POOL_BUFFER_LOAD │ 99 │ +│ RECIPROCAL │ 75 │ +│ UNKNOWN(0xd9) │ 75 │ +│ IOTA │ 73 │ +│ STREAM_TRANSPOSE │ 72 │ +│ COPY │ 72 │ +│ UNKNOWN(0xe8) │ 38 │ +│ PSEUDO_BRANCH_LABEL │ 5 │ +│ ALU_OP │ 2 │ +│ UNKNOWN(0xe5) │ 2 │ +│ PSEUDO_TENSOR_LOAD │ 1 │ +│ MOVE │ 1 │ +│ NOP │ 1 │ +│ RNG │ 1 │ +│ TENSOR_SCALAR │ 1 │ +└─────────────────────┴────────┘ + +2025-08-07T13:53:45Z INFO 47306 [Codegen]: +┌────────────┬────────┐ +│ Engine │ Count │ +├────────────┼────────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 13440 │ +│ Scalar │ 14578 │ +│ Tensor │ 506363 │ +│ SyncDMA │ 0 │ +│ Vector │ 4280 │ +│ Sync │ 145 │ +│ All │ 0 │ +└────────────┴────────┘ + +2025-08-07T13:53:45Z INFO 47306 [Codegen]: Total instructions: 538806 (0.0321153 GB) +2025-08-07T13:53:45Z INFO 47306 [Codegen]: Total DynamicDMA instruction count: 8085 +2025-08-07T13:53:45Z USER 47306 [Codegen]: isa_gen finished after 1.162 seconds +2025-08-07T13:53:45Z INFO 47306 [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────┼────────────────┤ +│ qActSpillReload0 │ 5932 │ +│ qDVESpillReload0 │ 138 │ +│ qPoolIO0 │ 2 │ +│ qPoolSpillReload0 │ 7308 │ +│ qSPIO0 │ 70 │ +│ qSPSpillReload0 │ 12384 │ +└───────────────────┴────────────────┘ + +Total descriptors: 25834 (0.000384957 GB) +2025-08-07T13:53:45Z INFO 47306 [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qSPIO0 │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qPoolDynamic │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +│ qPoolIO0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 112 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-08-07T13:53:45Z INFO 47306 [Codegen]: Tensors with largest descriptor count: +┌─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬───────────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┼───────────────┼──────────┼──────────────────┤ +│ Coalesced_memloc_Coalesced_memloc_cosine.152.50071--cosine.152.50067_24--Coalesced_memloc_cosine.152.50058--cosine.152.50054_27_99 │ Internal │ float32 │ 5 │ +│ Coalesced_memloc_Coalesced_memloc_cosine.152.50175--cosine.152.50171_0--Coalesced_memloc_cosine.152.50162--cosine.152.50158_3_87 │ Internal │ float32 │ 5 │ +│ Coalesced_memloc_Coalesced_memloc_cosine.152.50149--cosine.152.50145_6--Coalesced_memloc_cosine.152.50136--cosine.152.50132_9_90 │ Internal │ float32 │ 5 │ +│ Coalesced_memloc_Coalesced_memloc_cosine.152.49993--cosine.152.49989_42--Coalesced_memloc_cosine.152.49980--cosine.152.49976_45_108 │ Internal │ float32 │ 5 │ +│ Coalesced_memloc_Coalesced_memloc_cosine.152.49889--cosine.152.49885_66--Coalesced_memloc_cosine.152.49876--cosine.152.49872_69_120 │ Internal │ float32 │ 5 │ +│ Coalesced_memloc_Coalesced_memloc_cosine.152.49863--cosine.152.49859_72--Coalesced_memloc_cosine.152.49850--cosine.152.49846_75_123 │ Internal │ float32 │ 5 │ +│ Coalesced_memloc_Coalesced_memloc_cosine.152.50045--cosine.152.50041_30--Coalesced_memloc_cosine.152.50032--cosine.152.50028_33_102 │ Internal │ float32 │ 5 │ +│ Coalesced_memloc_Coalesced_memloc_cosine.152.49915--cosine.152.49911_60--Coalesced_memloc_cosine.152.49902--cosine.152.49898_63_117 │ Internal │ float32 │ 5 │ +│ input2 │ ExternalInput │ int32 │ 31 │ +│ convert.840 │ Internal │ float32 │ 599 │ +└─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴───────────────┴──────────┴──────────────────┘ + +2025-08-07T13:53:45Z USER 47306 [Codegen]: dma_desc_gen finished after 0.014 seconds +2025-08-07T13:53:45Z INFO 47306 [Codegen]: Estimated peak DRAM usage: 7.73645 GB +2025-08-07T13:53:45Z INFO 47306 [Codegen]: Generating debug info +2025-08-07T13:53:46Z WARNING 47306 [Codegen]: Found 163 instructions with more than 100 dependencies. For each such instruction, skipping writing more than 100 dependencies into the built-in NEFF debug info to prevent excessive compile time and NEFF size. For those instructions, the Neuron profiler will not display the skipped dependencies. +2025-08-07T13:53:46Z USER 47306 [Codegen]: debug_info_gen finished after 0.550 seconds +2025-08-07T13:53:46Z USER 47306 [ModuleForkPass]: codegen finished after 1.778 seconds +2025-08-07T13:53:46Z INFO 47306 [ModuleForkPass]: curr_vmrss: 2216mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:46Z INFO 47306 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 286091 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:46Z USER 47306 [BackendPassManager]: mod_parallel_pass finished after 1.803 seconds +2025-08-07T13:53:46Z INFO 47306 [BackendPassManager]: curr_vmrss: 2022mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:46Z INFO 47306 [BackendPassManager]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 286091 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:46Z USER 47306 [BackendPassManager]: Running neff_packager +2025-08-07T13:53:46Z INFO 47306 [BackendPassManager]: Inputs to neff_packager: modules=1 functions=1 allocs=27771 blocks=1 instructions=286091 Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:46Z WARNING 47306 [NeffFileWriter]: writeKelp missing file /local/p4clients/pkgbuild-const/workspace/build/KaenaCompiler/KaenaCompiler-2.x.169490.0/AL2_x86_64/DEV.STD.PTHREAD/build/private/_skbuild/linux-x86_64-3.10/cmake-build/neuronxcc/walrus/neff_packager/MetricMetadata.json +2025-08-07T13:53:46Z INFO 47306 [NeffFileWriter]: Neff will be written to: /home/ubuntu/qwen3/token_generation_model/_tp0_bk0/model.MODULE_6ef5ba8b41fbbe77f080+74ae8282.neff +2025-08-07T13:53:46Z INFO 47306 [NeffFileWriter]: IR signature: b10f509ebdeafba6769739af0b92c8e2 for neff artifacts +2025-08-07T13:53:46Z USER 47306 [BackendPassManager]: neff_packager finished after 0.311 seconds +2025-08-07T13:53:46Z INFO 47306 [BackendPassManager]: curr_vmrss: 2022mb, ru_maxrss: 2379mb (delta=0mb) +2025-08-07T13:53:46Z INFO 47306 [BackendPassManager]: Output has 1 module(s), 1 function(s), 27771 memory location(s), 1 block(s), and 286091 instruction(s). Max writers: 1537 Max Readers: 20035 +2025-08-07T13:53:46Z INFO 47306 [BackendDriver]: HBM scratchpad usage summary (post-allocation): +┌──────┬───────────┬────────────────────────────────────────────────────────────┬─────────────┐ +│ Core │ Subgraph │ Description │ Value │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc00 │ module │ Peak scratchpad usage: local │ 0.002903 GB │ +│ nc00 │ module │ Total size of allocated tensors: local │ 0.003231 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local │ 0.002903 GB │ +│ nc00 │ Post-link │ Peak scratchpad usage after intermediate tensor allocation │ 0.000000 GB │ +│ nc00 │ Post-link │ Total size of allocated intermediate tensors │ 0.000000 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ Max │ Max │ Peak scratchpad usage │ 0.002903 GB │ +│ Max │ Max │ Peak scratchpad usage (page-aligned) │ 0.500000 GB │ +└──────┴───────────┴────────────────────────────────────────────────────────────┴─────────────┘ + +2025-08-07T13:53:46Z INFO 47306 [BackendDriver]: Backend completed successfully, tearing down. +2025-08-07T13:53:47Z INFO 47058 [job.WalrusDriver.0]: Job #0 finished +2025-08-07T13:53:47Z INFO 47058 [pipeline.Pipeline.0]: Finished job job.WalrusDriver.0 +2025-08-07T13:53:47Z INFO 47058 [pipeline.Pipeline.0]: Starting job job.BIRLinker.0 +2025-08-07T13:53:47Z INFO 47058 [job.BIRLinker.0]: Replay this job by calling: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/bin/neuronx-cc compile --framework XLA --state '{"model": ["/home/ubuntu/qwen3/token_generation_model/_tp0_bk0/model.MODULE_6ef5ba8b41fbbe77f080+74ae8282.hlo_module.pb"], "tensormap": "tensor_map.json", "bir": "bir.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "state_dir": "/home/ubuntu/qwen3/token_generation_model/_tp0_bk0/neuronxcc-ykq_7n9z/sg00", "state_id": "sg00"}' --pipeline BIRLinker +2025-08-07T13:53:47Z INFO 47058 [job.BIRLinker.0]: BIRLinker cwd: /home/ubuntu/qwen3/token_generation_model/_tp0_bk0/neuronxcc-ykq_7n9z +2025-08-07T13:53:47Z INFO 47058 [job.BIRLinker.0]: Linking not needed. Netlist doesnt exist +2025-08-07T13:53:47Z INFO 47058 [pipeline.Pipeline.0]: Finished job job.BIRLinker.0 +2025-08-07T13:53:47Z INFO 47058 [pipeline.Pipeline.0]: Starting job job.Kelper.0 +2025-08-07T13:53:47Z INFO 47058 [job.Kelper.0]: Skipping neff generation which was already performed by neff_packager +2025-08-07T13:53:47Z INFO 47058 [pipeline.Pipeline.0]: Finished job job.Kelper.0 +2025-08-07T13:53:47Z INFO 47058 [pipeline.Pipeline.0]: Starting job job.NeffWrapper.0 +2025-08-07T13:53:47Z INFO 47058 [job.NeffWrapper.0]: Job NeffWrapper len(in_states) 1 +2025-08-07T13:53:47Z INFO 47058 [job.NeffWrapper.0]: Processing input #0 +2025-08-07T13:53:47Z INFO 47058 [job.NeffWrapper.0]: Start NeffWrapper +2025-08-07T13:53:47Z INFO 47058 [job.NeffWrapper.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo-neff-wrapper --hlo /home/ubuntu/qwen3/token_generation_model/_tp0_bk0/model.MODULE_6ef5ba8b41fbbe77f080+74ae8282.hlo_module.pb --neff /home/ubuntu/qwen3/token_generation_model/_tp0_bk0/model.MODULE_6ef5ba8b41fbbe77f080+74ae8282.neff --io_transposes /home/ubuntu/qwen3/token_generation_model/_tp0_bk0/neuronxcc-ykq_7n9z/io_transposes.json --output /home/ubuntu/qwen3/token_generation_model/_tp0_bk0/wrapped_neff.hlo --netlist /home/ubuntu/qwen3/token_generation_model/_tp0_bk0/neuronxcc-ykq_7n9z/hlo_netlist.json +2025-08-07T13:53:48Z INFO 47058 [job.NeffWrapper.0]: Could not open file: /home/ubuntu/qwen3/token_generation_model/_tp0_bk0/neuronxcc-ykq_7n9z/hlo_netlist.json +Hlo neff wrapper finished successfully. Have a wonderful day :D + +2025-08-07T13:53:48Z INFO 47058 [job.NeffWrapper.0]: Job #0 finished +2025-08-07T13:53:48Z INFO 47058 [pipeline.Pipeline.0]: Finished job job.NeffWrapper.0 +2025-08-07T13:53:48Z INFO 47058 [pipeline.Pipeline.0]: Finished pipeline Pipeline +2025-08-07T13:53:48Z INFO 47058 [pipeline.Pipeline.0]: Job #0 finished +2025-08-07T13:53:48Z INFO 46994 [root]: Subcommand returned with exitcode=0 diff --git a/token_generation_model/_tp0_bk0/metaneff.pb b/token_generation_model/_tp0_bk0/metaneff.pb new file mode 100644 index 0000000000000000000000000000000000000000..0b2f785503adfa7fb83f01e84b116815b68c377f --- /dev/null +++ b/token_generation_model/_tp0_bk0/metaneff.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f42b279a662fc21e6bb94ab8bdb96ad553535cec385b6c8909a4e7622fad939 +size 985283 diff --git a/token_generation_model/_tp0_bk0/model.MODULE_6ef5ba8b41fbbe77f080+74ae8282.hlo_module.pb b/token_generation_model/_tp0_bk0/model.MODULE_6ef5ba8b41fbbe77f080+74ae8282.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..c3c8b6caa83113e55955fe5a88765020b5b6dc03 --- /dev/null +++ b/token_generation_model/_tp0_bk0/model.MODULE_6ef5ba8b41fbbe77f080+74ae8282.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b84de1c7109b93d3bf677f50a6adfce9d88aab86c7f512a7234c08cd856732f +size 957497 diff --git a/token_generation_model/_tp0_bk0/model.MODULE_6ef5ba8b41fbbe77f080+74ae8282.neff b/token_generation_model/_tp0_bk0/model.MODULE_6ef5ba8b41fbbe77f080+74ae8282.neff new file mode 100644 index 0000000000000000000000000000000000000000..972c91a0d0d805310ba7367f6a8614353443f945 --- /dev/null +++ b/token_generation_model/_tp0_bk0/model.MODULE_6ef5ba8b41fbbe77f080+74ae8282.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82be447a0a308a6e83990d1f3d193b4dc43ab835b136e7c27647ecf6cde94383 +size 6001664 diff --git a/token_generation_model/_tp0_bk0/neuron_config.json b/token_generation_model/_tp0_bk0/neuron_config.json new file mode 100644 index 0000000000000000000000000000000000000000..56d1105fe562a2504b36fbf28f9348f4b53919cd --- /dev/null +++ b/token_generation_model/_tp0_bk0/neuron_config.json @@ -0,0 +1,220 @@ +{ + "_attn_implementation_autoset": false, + "_name_or_path": "Qwen/Qwen3-8B", + "add_cross_attention": false, + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "attribute_map": {}, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 151643, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 151645, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "fused_spec_config": null, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 12288, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 40960, + "max_window_layers": 36, + "metadata": null, + "min_length": 0, + "model_type": "qwen3", + "neuron_config": { + "activation_quantization_type": null, + "allow_input_truncation": false, + "apply_seq_ids_mask": false, + "async_mode": false, + "attention_dp_degree": 1, + "attention_dtype": null, + "attn_block_cte_nki_kernel_enabled": false, + "attn_block_tkg_nki_kernel_cache_update": false, + "attn_block_tkg_nki_kernel_enabled": false, + "attn_cls": { + "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3", + "__name__": "NeuronQwen3Attention" + }, + "attn_kernel_enabled": null, + "attn_tkg_builtin_kernel_enabled": false, + "attn_tkg_nki_kernel_enabled": false, + "batch_size": 1, + "bucket_n_active_tokens": false, + "buckets": [ + 128 + ], + "cast_type": "config", + "cc_pipeline_tiling_factor": 1, + "chunked_prefill_config": null, + "context_encoding_buckets": null, + "cp_degree": 1, + "ctx_batch_size": 1, + "disable_kv_cache_tiling": false, + "draft_model_modules_to_not_convert": null, + "enable_bucketing": true, + "enable_eagle_draft_input_norm": false, + "enable_eagle_speculation": false, + "enable_fused_speculation": false, + "enable_long_context_mode": false, + "enable_output_completion_notifications": false, + "enable_spill_reload_dge": false, + "enable_token_tree": false, + "ep_degree": 1, + "expert_mlp_nki_kernel_enabled": null, + "flash_decoding_enabled": false, + "fused_qkv": false, + "fused_rmsnorm_skip_gamma": false, + "is_block_kv_layout": null, + "is_chunked_prefill": false, + "is_continuous_batching": true, + "is_eagle_draft": false, + "is_medusa": false, + "is_prefill_stage": false, + "is_prefix_caching": false, + "k_cache_transposed": false, + "kv_cache_batch_size": 1, + "kv_cache_padding_size": 0, + "kv_cache_quant": false, + "kv_cache_tiling": false, + "layer_boundary_markers": false, + "lm_head_pad": false, + "lm_head_pad_alignment_size": 1, + "local_ranks_size": 2, + "logical_nc_config": 1, + "lora_config": null, + "max_batch_size": 1, + "max_context_length": 1024, + "max_length": 1024, + "max_new_tokens": null, + "medusa_speculation_length": 0, + "medusa_tree": null, + "mlp_kernel_enabled": false, + "mlp_kernel_fuse_residual_add": false, + "modules_to_not_convert": null, + "moe_fused_nki_kernel_enabled": null, + "n_active_tokens": 1, + "n_positions": 1024, + "num_medusa_heads": 0, + "on_cpu": false, + "on_device_sampling_config": { + "deterministic": false, + "do_sample": false, + "dynamic": true, + "global_topk": 256, + "on_device_sampling_config": true, + "temperature": 1.0, + "top_k": 1, + "top_k_kernel_enabled": false, + "top_p": 1.0 + }, + "output_logits": false, + "overrides_torch_dtype": true, + "pa_block_size": 1024, + "pa_num_blocks": 1, + "padding_side": "right", + "pp_degree": 1, + "prefix_buckets": null, + "qk_layernorm": false, + "qkv_kernel_enabled": false, + "qkv_kernel_fuse_residual_add": false, + "qkv_kernel_nbsd_layout": false, + "quantization_dtype": "int8", + "quantization_type": "per_tensor_symmetric", + "quantize_clamp_bound": Infinity, + "quantized": false, + "quantized_checkpoints_path": null, + "quantized_mlp_kernel_enabled": false, + "rmsnorm_quantize_kernel_enabled": false, + "router_topk_nki_kernel_enabled": null, + "rpl_reduce_dtype": null, + "save_sharded_checkpoint": true, + "scratchpad_page_size": null, + "seq_len": 1024, + "seq_len_threshold_for_cc_tiling": 16384, + "sequence_parallel_enabled": false, + "shared_mlp_nki_kernel_enabled": null, + "skip_sharding": false, + "skip_warmup": false, + "spec_batch_size": 1, + "speculation_length": 0, + "start_rank_id": 0, + "target": null, + "tile_cc": false, + "tkg_batch_size": 1, + "token_generation_buckets": [ + 128 + ], + "token_tree_config": null, + "torch_dtype": "bfloat16", + "tp_degree": 2, + "vocab_parallel": false, + "weight_gather_seq_len_threshold": 32768, + "weights_to_skip_layout_optimization": [], + "world_size": 2 + }, + "no_repeat_ngram_size": 0, + "num_attention_heads": 32, + "num_beam_groups": 1, + "num_beams": 1, + "num_cores_per_group": 1, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sep_token_id": null, + "sliding_window": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torchscript": false, + "transformers_version": "4.51.0", + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/token_generation_model/_tp0_bk0/wrapped_neff.hlo b/token_generation_model/_tp0_bk0/wrapped_neff.hlo new file mode 100644 index 0000000000000000000000000000000000000000..2f734cdd974efbcca63f97732d6f5d79be1d98a8 --- /dev/null +++ b/token_generation_model/_tp0_bk0/wrapped_neff.hlo @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5250286af3a576703a7879fff0513d8b034e9e082d5f6e86399bcceb6b7b164c +size 6206032 diff --git a/token_generation_model/_tp0_bk1/command.txt b/token_generation_model/_tp0_bk1/command.txt new file mode 100644 index 0000000000000000000000000000000000000000..6929ded5aec062d47be7630abd4c7b08d7971e39 --- /dev/null +++ b/token_generation_model/_tp0_bk1/command.txt @@ -0,0 +1 @@ +neuronx-cc compile --framework=XLA model.MODULE_d608453625db6ed38994+e5eecdd4.hlo_module.pb --output model.MODULE_d608453625db6ed38994+e5eecdd4.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ' --lnc=1 -O2 --internal-hlo2tensorizer-options=--verify-hlo=true --logfile=log-neuron-cc.txt --verbose=35 \ No newline at end of file diff --git a/token_generation_model/_tp0_bk1/compile_flags.MODULE_d608453625db6ed38994+e5eecdd4.json b/token_generation_model/_tp0_bk1/compile_flags.MODULE_d608453625db6ed38994+e5eecdd4.json new file mode 100644 index 0000000000000000000000000000000000000000..98ac3880757053a1b98e9e982d379eec3a2650f0 --- /dev/null +++ b/token_generation_model/_tp0_bk1/compile_flags.MODULE_d608453625db6ed38994+e5eecdd4.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ", "--lnc=1", "-O2", "--internal-hlo2tensorizer-options=--verify-hlo=true", "--logfile=/home/ubuntu/qwen3/token_generation_model/_tp0_bk1/log-neuron-cc.txt"] \ No newline at end of file diff --git a/token_generation_model/_tp0_bk1/global_metric_store.json b/token_generation_model/_tp0_bk1/global_metric_store.json new file mode 100644 index 0000000000000000000000000000000000000000..74a16f31099426df83cd9b8286ed84ca73392a61 --- /dev/null +++ b/token_generation_model/_tp0_bk1/global_metric_store.json @@ -0,0 +1,540 @@ +{ + "Average": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 99.8356704711914, + "StaticProfiler::AveragePartitionUtilization": 99.36558532714844, + "StaticProfiler::AveragePeUtilization": 99.6281509399414, + "StaticProfiler::LocalizationEfficiency": 109.8439712524414, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 109.93106842041016, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0 + } + }, + "Count": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 1, + "StaticProfiler::AveragePartitionUtilization": 1, + "StaticProfiler::AveragePeUtilization": 1, + "StaticProfiler::LocalizationEfficiency": 1, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1, + "TilingProfiler::AveragePeUtilizationAfterTiling": 1 + } + }, + "Sum": { + "compiletime": { + "AGOrderingAnalysisPass": 2.232550621032715, + "AffinePredicateResolution": 0.05623936653137207, + "AliasDependencyElimination": 0.002832651138305664, + "AliasDependencyInduction": 0.44058680534362793, + "AliasDependencyReset": 0.4666476249694824, + "BFComputeCutting": 0.0580902099609375, + "BirCodeGenLoop": 2.594275712966919, + "CCOpFusion": 0.4991264343261719, + "CanonicalizeConv": 1.9999999949504854e-06, + "CanonicalizeDAGForPGTiling": 0.22878766059875488, + "CanonicalizeForTensorizer": 0.00043799998820759356, + "CanonicalizeIR": 0.07420587539672852, + "Canonicalizer": 0.008046000264585018, + "CoalesceCCOp": 0.19905948638916016, + "CommuteConcat": 0.03734135627746582, + "DMALocalityOpt": 0.03657102584838867, + "DMAProfiler": 0.09372735023498535, + "DMATilingProfiler": 0.07789874076843262, + "DataLocalityOpt": 2.0418925285339355, + "DataStreaming": 0.16410613059997559, + "DeConcat": 0.013718366622924805, + "DeadCodeElimination": 0.040288686752319336, + "DeadStoreElimination": 0.41424059867858887, + "DelinearIndices": 0.35643458366394043, + "Delinearization": 0.2430880069732666, + "DoNothing": 0.0002276897430419922, + "DramToDramTranspose": 1.1447904109954834, + "DumpGraphAndMetadata": 0.25116658210754395, + "EliminateDivs": 0.18507814407348633, + "ExpandBatchNorm": 0.0696108341217041, + "ExpandISAMacro": 0.09687161445617676, + "FactorizeBlkDims": 0.2835962772369385, + "FactorizeThreadAxesInFreeDims": 0.041979074478149414, + "FlattenMacroLoop": 0.08133101463317871, + "GenericAccessSimplifier": 0.03695535659790039, + "HoistCompute": 6.800000119255856e-05, + "IdentifyCrossPassTensors": 0.00047900000936351717, + "InferInitValue": 1.1282598972320557, + "InferIntrinsicOnCC": 0.4870164394378662, + "InferNeuronTensor": 1.9024591445922852, + "InferNonlocalTensors": 4.48793888092041, + "InferPSumTensor": 1.0733928680419922, + "InlineNativeKernels": 0.05753469467163086, + "InsertIOTransposes": 1.0351307392120361, + "InsertLocalTransposes": 1.1141972541809082, + "InsertOffloadedTransposes": 0.09859800338745117, + "LICM": 0.11556148529052734, + "LateLegalizeInst": 0.23787212371826172, + "LateLegalizePostSplit": 0.10073041915893555, + "LateLowerReshapeOp": 0.04807281494140625, + "LateLowerTensorOp": 0.39286017417907715, + "LateNeuronInstComb": 0.48323988914489746, + "LayoutPreprocessing": 1.1271135807037354, + "LayoutPreprocessingAndAnalysis": 1.5124428272247314, + "LayoutRequirementAnalysis": 0.3717648983001709, + "LegalizeCCOpLayout": 0.0850982666015625, + "LegalizeOpLevelAlias": 0.040421247482299805, + "LegalizePartitionReduce": 0.03851008415222168, + "LegalizeSundaAccess": 1.4664347171783447, + "LegalizeSundaMacro": 0.41884398460388184, + "LegalizeType": 0.1985793113708496, + "LocalLayoutOpt": 0.41052842140197754, + "LoopFusion": 0.3379099369049072, + "LoopSplitting": 0.014409780502319336, + "LowerBroadcast": 0.052779197692871094, + "LowerCCOpBlockAxis": 0.24830293655395508, + "LowerComplexBroadcast": 0.16336774826049805, + "LowerIntrinsics": 1.4745566844940186, + "LowerTensorOp": 0.5229439735412598, + "LowerTranspose": 0.4257504940032959, + "MacroGeneration": 2.4529645442962646, + "MaskPropagation": 0.15470170974731445, + "MemcastMotion": 0.00020900000527035445, + "MemcpyElimination": 5.041346549987793, + "MutateDataType": 0.058200836181640625, + "NeuronAliasDependencyInduction": 0.027556180953979492, + "NeuronAliasDependencyReset": 0.03658699989318848, + "NeuronInstComb": 0.20778894424438477, + "NeuronLICM": 0.29681825637817383, + "NeuronLoopFusion": 0.47371411323547363, + "NeuronLoopInterchange": 0.0506441593170166, + "NeuronSimplifier": 0.33747220039367676, + "NeuronSimplifyPredicates": 0.18614435195922852, + "NeuronValueNumbering": 0.12105250358581543, + "OptimizeAliasedCopyChain": 0.017556190490722656, + "OptimizeNKIKernels": 0.4447648525238037, + "PAGLayoutOpt": 30.352853775024414, + "PComputeCutting": 0.2897770404815674, + "PGLayoutTilingPipeline": 44.85792541503906, + "PGTiling": 5.324963092803955, + "PadElimination": 0.009048938751220703, + "ParAxesAnnotation": 29.229726791381836, + "PartialLoopFusion": 0.317737340927124, + "PartialSimdFusion": 0.2749216556549072, + "PenguinizeFunctions": 0.00023200000578071922, + "PerfectLoopNest": 0.06647539138793945, + "PruneFunctions": 0.000311999989207834, + "RecognizeOpIdiom": 0.21295762062072754, + "Recompute": 0.010057210922241211, + "RelaxPredicates": 0.16659164428710938, + "Rematerialization": 0.16863059997558594, + "RemoveOptimizationBarriers": 0.0006970000104047358, + "ReshapeWeights": 0.023304462432861328, + "ResolveAccessConflict": 0.269362211227417, + "ResolveComplicatePredicates": 0.05520820617675781, + "RewriteReplicationMatmul": 0.05014157295227051, + "RewriteWeights": 0.06727242469787598, + "SFKVectorizer": 3.9295907020568848, + "ScatterMotion": 0.0037159998901188374, + "SimpleAllReduceTiling": 0.07012629508972168, + "Simplifier": 0.13543295860290527, + "SimplifyMacroPredicates": 0.20038676261901855, + "SimplifyNeuronTensor": 1.4079937934875488, + "SimplifySlice": 0.03729248046875, + "SimplifyTensor": 0.23219513893127441, + "SpillPSum": 0.37714505195617676, + "SplitAPUnionSets": 0.3479146957397461, + "SplitAccGrp": 0.04291415214538574, + "StaticProfiler": 0.1440873146057129, + "StaticTransposeLocalTensor": 0.25237345695495605, + "SundaISel": 1.9206316471099854, + "TCTransform": 0.03933572769165039, + "TensorInitialization": 0.14406371116638184, + "TensorOpSimplifier": 0.4716155529022217, + "TensorOpTransform": 1.603982925415039, + "TensorizerLegalizationPass": 0.00018699999782256782, + "TileCCOps": 0.23507094383239746, + "TilingProfiler": 0.4180028438568115, + "TransformConvOp": 0.07676315307617188, + "TritiumFusion": 1.224381923675537, + "ValueNumbering": 0.10977578163146973, + "VectorizeDMA": 0.03783679008483887, + "VectorizeMatMult": 0.023741960525512695, + "VerifySupportedOps": 0.0002699999895412475, + "WeightCoalescing": 0.0594639778137207, + "ZeroSizeTensorElimination": 0.0004744529724121094, + "algsimp": 0.0026970000471919775, + "batchnorm_expander": 0.0008980000275187194, + "boundary-marker-removal": 0.00048499999684281647, + "call-inliner": 0.0004889999981969595, + "canonicalize-boundary-marker": 0.0005579999997280538, + "collective-stream-id-checker": 7.000000186963007e-05, + "comparison-expander": 0.0005319999763742089, + "computation-deduplicator": 0.0005189999938011169, + "conditional-to-select": 0.00016399999731220305, + "config-lowering": 0.00040499999886378646, + "constant_folding": 0.00029799999902024865, + "cse": 0.0006779999821446836, + "dce": 7.200000254670158e-05, + "dynamic-slice-transpose": 0.00026199998683296144, + "eliminate-redundant-compare": 0.00028700000257231295, + "emit-offloaded-dropout": 0.00041700000292621553, + "flatten-call-graph": 0.00039500001003034413, + "fuse-send-recv": 0.002090000081807375, + "hilo::LegalizeAlias": 0.0038960000965744257, + "hilo::NeuronInstCombine": 0.0013500000350177288, + "hilo::NeuronOpFusion": 0.0006019999855197966, + "hilo::ReplaceTokenTypeWithU8Pass": 0.00044999999227002263, + "hilo::ScheduleFusion": 4.099999932805076e-05, + "hilo::SixtyFourHack": 0.000291000003926456, + "hilo::VerifyAliasing": 8.70000003487803e-05, + "hlo-mac-count": 0.0011050000321120024, + "hlo-verifier": 0.0074269999749958515, + "io-con-pipe-begin": 7.000000096013537e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.0013160000089555979, + "legalize-ccops": 3.7000001611886546e-05, + "legalize-compare": 0.0004619999963324517, + "lower-argminmax-custom-call": 0.00025400001322850585, + "map-inline": 0.0007459999760612845, + "metadata-naming": 0.0012679999927058816, + "mlir::detail::OpToOpPassAdaptor": 0.00024399999529123306, + "mlir::hlo::MhloToPyPenguin": 0.03399299830198288, + "mlir::mhlo::LowerComplexExtraPass": 0.0032820000778883696, + "mlir::mhlo::LowerComplexPass": 0.0028880001045763493, + "native-to-custom-softmax": 0.0005740000051446259, + "native-to-custom-softmax-dx": 0.0005530000198632479, + "operand_upcaster": 0.0010339999571442604, + "post-par-pipe-begin": 9.999999974752427e-07, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.06707599759101868, + "pre-hlo-begin": 4.999999873689376e-06, + "pre-hlo-end": 9.999999974752427e-07, + "replace-minimum-constant": 0.00038899999344721437, + "reshape-mover": 0.0001250000059371814, + "simplify-concat": 0.0025239998940378428, + "simplify-while-loops": 0.00010599999950500205, + "transform-variadic-reduce": 0.0007849999819882214, + "tuple-simplifier": 0.0003000000142492354, + "unpack-nested-aws-ntwsr": 0.0004900000058114529, + "unroll-while-loop": 2.4000000848900527e-05 + }, + "hilo": { + "HloMacCount": 3821871104.0, + "Traffic": 8267154944.0 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 263005, + "StaticProfiler::AifUb": 10.638744354248047, + "StaticProfiler::ArithmeticIntensityTensorizer": 11.686018943786621, + "StaticProfiler::AverageDmaLength": 6504.53759765625, + "StaticProfiler::DDRTransferBytes": 7596639576, + "StaticProfiler::InternalTransferBytes": 637633556, + "StaticProfiler::LoadExpanded": 1061057, + "StaticProfiler::StoreExpanded": 3422, + "StaticProfiler::TotalDMAExpanded": 1064479, + "StaticProfiler::TotalDynamicInstancesCount": 276347, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 275718, + "StaticProfiler::TotalLNCComm": 0, + "StaticProfiler::TotalLNCCommTransfer": 0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0, + "TilingProfiler::DmaInstructionsAfterTiling": 0, + "TilingProfiler::GenericInstructionsAfterTiling": 79, + "TilingProfiler::MatMultInstructionsAfterTiling": 231696, + "TilingProfiler::NumPfTransposes": 398, + "TilingProfiler::NumPfTransposesForIo": 37, + "TilingProfiler::NumPfTransposesForLocal": 216, + "TilingProfiler::NumPfTransposesForNonlocal": 145, + "TilingProfiler::PfTransposeInstructions": 19729, + "TilingProfiler::PfTransposeInstructionsForIo": 19296, + "TilingProfiler::PfTransposeInstructionsForLocal": 288, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 145, + "TilingProfiler::ReduceInstructionsAfterTiling": 74, + "TilingProfiler::SimdInstructionsAfterTiling": 3035, + "TilingProfiler::TotalInstructionsAfterTiling": 0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0, + "TransformConvOp::conv2d_column_packing": 0, + "TransformConvOp::conv2d_column_packing_1": 0, + "TransformConvOp::conv2d_column_packing_io10": 0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0 + } + }, + "all": { + "compiletime": { + "CanonicalizeConv": 1.9999999949504854e-06, + "CanonicalizeForTensorizer": 0.00043799998820759356, + "Canonicalizer": 0.008046000264585018, + "HoistCompute": 6.800000119255856e-05, + "IdentifyCrossPassTensors": 0.00047900000936351717, + "MemcastMotion": 0.00020900000527035445, + "PenguinizeFunctions": 0.00023200000578071922, + "PruneFunctions": 0.000311999989207834, + "RemoveOptimizationBarriers": 0.0006970000104047358, + "ScatterMotion": 0.0037159998901188374, + "TensorizerLegalizationPass": 0.00018699999782256782, + "VerifySupportedOps": 0.0002699999895412475, + "algsimp": 0.0026970000471919775, + "batchnorm_expander": 0.0008980000275187194, + "boundary-marker-removal": 0.00048499999684281647, + "call-inliner": 0.0004889999981969595, + "canonicalize-boundary-marker": 0.0005579999997280538, + "collective-stream-id-checker": 7.000000186963007e-05, + "comparison-expander": 0.0005319999763742089, + "computation-deduplicator": 0.0005189999938011169, + "conditional-to-select": 0.00016399999731220305, + "config-lowering": 0.00040499999886378646, + "constant_folding": 0.00029799999902024865, + "cse": 0.0006779999821446836, + "dce": 7.200000254670158e-05, + "dynamic-slice-transpose": 0.00026199998683296144, + "eliminate-redundant-compare": 0.00028700000257231295, + "emit-offloaded-dropout": 0.00041700000292621553, + "flatten-call-graph": 0.00039500001003034413, + "fuse-send-recv": 0.002090000081807375, + "hilo::LegalizeAlias": 0.0038960000965744257, + "hilo::NeuronInstCombine": 0.0013500000350177288, + "hilo::NeuronOpFusion": 0.0006019999855197966, + "hilo::ReplaceTokenTypeWithU8Pass": 0.00044999999227002263, + "hilo::ScheduleFusion": 4.099999932805076e-05, + "hilo::SixtyFourHack": 0.000291000003926456, + "hilo::VerifyAliasing": 8.70000003487803e-05, + "hlo-mac-count": 0.0011050000321120024, + "hlo-verifier": 0.0074269999749958515, + "io-con-pipe-begin": 7.000000096013537e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.0013160000089555979, + "legalize-ccops": 3.7000001611886546e-05, + "legalize-compare": 0.0004619999963324517, + "lower-argminmax-custom-call": 0.00025400001322850585, + "map-inline": 0.0007459999760612845, + "metadata-naming": 0.0012679999927058816, + "mlir::detail::OpToOpPassAdaptor": 0.00024399999529123306, + "mlir::hlo::MhloToPyPenguin": 0.03399299830198288, + "mlir::mhlo::LowerComplexExtraPass": 0.0032820000778883696, + "mlir::mhlo::LowerComplexPass": 0.0028880001045763493, + "native-to-custom-softmax": 0.0005740000051446259, + "native-to-custom-softmax-dx": 0.0005530000198632479, + "operand_upcaster": 0.0010339999571442604, + "post-par-pipe-begin": 9.999999974752427e-07, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.06707599759101868, + "pre-hlo-begin": 4.999999873689376e-06, + "pre-hlo-end": 9.999999974752427e-07, + "replace-minimum-constant": 0.00038899999344721437, + "reshape-mover": 0.0001250000059371814, + "simplify-concat": 0.0025239998940378428, + "simplify-while-loops": 0.00010599999950500205, + "transform-variadic-reduce": 0.0007849999819882214, + "tuple-simplifier": 0.0003000000142492354, + "unpack-nested-aws-ntwsr": 0.0004900000058114529, + "unroll-while-loop": 2.4000000848900527e-05 + } + }, + "cumsum": { + "compiletime": { + "CoalesceCCOp": 0.0002086162567138672, + "DMALocalityOpt": 0.00018668174743652344, + "DMAProfiler": 0.0007734298706054688, + "DataStreaming": 0.00029754638671875, + "DoNothing": 0.0001506805419921875, + "ExpandISAMacro": 0.0005266666412353516, + "FactorizeBlkDims": 0.0004870891571044922, + "InferPSumTensor": 0.0004894733428955078, + "LateLegalizeInst": 0.00040268898010253906, + "LateNeuronInstComb": 0.0005068778991699219, + "LegalizeSundaAccess": 0.001726388931274414, + "LegalizeType": 0.00026416778564453125, + "LowerBroadcast": 0.00021910667419433594, + "LowerIntrinsics": 0.0002162456512451172, + "LowerTranspose": 0.0002193450927734375, + "NeuronInstComb": 0.0004906654357910156, + "NeuronLICM": 0.0004401206970214844, + "NeuronSimplifyPredicates": 0.0029528141021728516, + "NeuronValueNumbering": 0.0004203319549560547, + "SFKVectorizer": 0.0029082298278808594, + "SimpleAllReduceTiling": 0.00019216537475585938, + "SimplifyNeuronTensor": 0.0004115104675292969, + "SpillPSum": 0.0005426406860351563, + "WeightCoalescing": 0.0002117156982421875 + } + }, + "sg00": { + "hilo": { + "ArithmeticIntensity": 0.924591600894928, + "HloMacCount": 3821871104.0, + "Traffic": 8267154944.0 + } + }, + "sg0000": { + "compiletime": { + "AGOrderingAnalysisPass": 2.232550621032715, + "AffinePredicateResolution": 0.05623936653137207, + "AliasDependencyElimination": 0.002832651138305664, + "AliasDependencyInduction": 0.44058680534362793, + "AliasDependencyReset": 0.4666476249694824, + "BFComputeCutting": 0.0580902099609375, + "BirCodeGenLoop": 2.594275712966919, + "CCOpFusion": 0.4991264343261719, + "CanonicalizeDAGForPGTiling": 0.22878766059875488, + "CanonicalizeIR": 0.07420587539672852, + "CoalesceCCOp": 0.1988508701324463, + "CommuteConcat": 0.03734135627746582, + "DMALocalityOpt": 0.03638434410095215, + "DMAProfiler": 0.09295392036437988, + "DMATilingProfiler": 0.07789874076843262, + "DataLocalityOpt": 2.0418925285339355, + "DataStreaming": 0.16380858421325684, + "DeConcat": 0.013718366622924805, + "DeadCodeElimination": 0.040288686752319336, + "DeadStoreElimination": 0.41424059867858887, + "DelinearIndices": 0.35643458366394043, + "Delinearization": 0.2430880069732666, + "DoNothing": 7.700920104980469e-05, + "DramToDramTranspose": 1.1447904109954834, + "DumpGraphAndMetadata": 0.25116658210754395, + "EliminateDivs": 0.18507814407348633, + "ExpandBatchNorm": 0.0696108341217041, + "ExpandISAMacro": 0.0963449478149414, + "FactorizeBlkDims": 0.283109188079834, + "FactorizeThreadAxesInFreeDims": 0.041979074478149414, + "FlattenMacroLoop": 0.08133101463317871, + "GenericAccessSimplifier": 0.03695535659790039, + "InferInitValue": 1.1282598972320557, + "InferIntrinsicOnCC": 0.4870164394378662, + "InferNeuronTensor": 1.9024591445922852, + "InferNonlocalTensors": 4.48793888092041, + "InferPSumTensor": 1.0729033946990967, + "InlineNativeKernels": 0.05753469467163086, + "InsertIOTransposes": 1.0351307392120361, + "InsertLocalTransposes": 1.1141972541809082, + "InsertOffloadedTransposes": 0.09859800338745117, + "LICM": 0.11556148529052734, + "LateLegalizeInst": 0.23746943473815918, + "LateLegalizePostSplit": 0.10073041915893555, + "LateLowerReshapeOp": 0.04807281494140625, + "LateLowerTensorOp": 0.39286017417907715, + "LateNeuronInstComb": 0.48273301124572754, + "LayoutPreprocessing": 1.1271135807037354, + "LayoutPreprocessingAndAnalysis": 1.5124428272247314, + "LayoutRequirementAnalysis": 0.3717648983001709, + "LegalizeCCOpLayout": 0.0850982666015625, + "LegalizeOpLevelAlias": 0.040421247482299805, + "LegalizePartitionReduce": 0.03851008415222168, + "LegalizeSundaAccess": 1.4647083282470703, + "LegalizeSundaMacro": 0.41884398460388184, + "LegalizeType": 0.19831514358520508, + "LocalLayoutOpt": 0.41052842140197754, + "LoopFusion": 0.3379099369049072, + "LoopSplitting": 0.014409780502319336, + "LowerBroadcast": 0.05256009101867676, + "LowerCCOpBlockAxis": 0.24830293655395508, + "LowerComplexBroadcast": 0.16336774826049805, + "LowerIntrinsics": 1.4743404388427734, + "LowerTensorOp": 0.5229439735412598, + "LowerTranspose": 0.42553114891052246, + "MacroGeneration": 2.4529645442962646, + "MaskPropagation": 0.15470170974731445, + "MemcpyElimination": 5.041346549987793, + "MutateDataType": 0.058200836181640625, + "NeuronAliasDependencyInduction": 0.027556180953979492, + "NeuronAliasDependencyReset": 0.03658699989318848, + "NeuronInstComb": 0.20729827880859375, + "NeuronLICM": 0.29637813568115234, + "NeuronLoopFusion": 0.47371411323547363, + "NeuronLoopInterchange": 0.0506441593170166, + "NeuronSimplifier": 0.33747220039367676, + "NeuronSimplifyPredicates": 0.18319153785705566, + "NeuronValueNumbering": 0.12063217163085938, + "OptimizeAliasedCopyChain": 0.017556190490722656, + "OptimizeNKIKernels": 0.4447648525238037, + "PAGLayoutOpt": 30.352853775024414, + "PComputeCutting": 0.2897770404815674, + "PGLayoutTilingPipeline": 44.85792541503906, + "PGTiling": 5.324963092803955, + "PadElimination": 0.009048938751220703, + "ParAxesAnnotation": 29.229726791381836, + "PartialLoopFusion": 0.317737340927124, + "PartialSimdFusion": 0.2749216556549072, + "PerfectLoopNest": 0.06647539138793945, + "RecognizeOpIdiom": 0.21295762062072754, + "Recompute": 0.010057210922241211, + "RelaxPredicates": 0.16659164428710938, + "Rematerialization": 0.16863059997558594, + "ReshapeWeights": 0.023304462432861328, + "ResolveAccessConflict": 0.269362211227417, + "ResolveComplicatePredicates": 0.05520820617675781, + "RewriteReplicationMatmul": 0.05014157295227051, + "RewriteWeights": 0.06727242469787598, + "SFKVectorizer": 3.926682472229004, + "SimpleAllReduceTiling": 0.06993412971496582, + "Simplifier": 0.13543295860290527, + "SimplifyMacroPredicates": 0.20038676261901855, + "SimplifyNeuronTensor": 1.4075822830200195, + "SimplifySlice": 0.03729248046875, + "SimplifyTensor": 0.23219513893127441, + "SpillPSum": 0.3766024112701416, + "SplitAPUnionSets": 0.3479146957397461, + "SplitAccGrp": 0.04291415214538574, + "StaticProfiler": 0.1440873146057129, + "StaticTransposeLocalTensor": 0.25237345695495605, + "SundaISel": 1.9206316471099854, + "TCTransform": 0.03933572769165039, + "TensorInitialization": 0.14406371116638184, + "TensorOpSimplifier": 0.4716155529022217, + "TensorOpTransform": 1.603982925415039, + "TileCCOps": 0.23507094383239746, + "TilingProfiler": 0.4180028438568115, + "TransformConvOp": 0.07676315307617188, + "TritiumFusion": 1.224381923675537, + "ValueNumbering": 0.10977578163146973, + "VectorizeDMA": 0.03783679008483887, + "VectorizeMatMult": 0.023741960525512695, + "WeightCoalescing": 0.059252262115478516, + "ZeroSizeTensorElimination": 0.0004744529724121094 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 263005, + "StaticProfiler::AifUb": 10.638744354248047, + "StaticProfiler::ArithmeticIntensityTensorizer": 11.686018943786621, + "StaticProfiler::AverageDmaLength": 6504.53759765625, + "StaticProfiler::AverageFractalPeUtilization": 99.8356704711914, + "StaticProfiler::AveragePartitionUtilization": 99.36558532714844, + "StaticProfiler::AveragePeUtilization": 99.6281509399414, + "StaticProfiler::DDRTransferBytes": 7596639576, + "StaticProfiler::InternalTransferBytes": 637633556, + "StaticProfiler::LoadExpanded": 1061057, + "StaticProfiler::LocalizationEfficiency": 109.8439712524414, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 109.93106842041016, + "StaticProfiler::StoreExpanded": 3422, + "StaticProfiler::TotalDMAExpanded": 1064479, + "StaticProfiler::TotalDynamicInstancesCount": 276347, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 275718, + "StaticProfiler::TotalLNCComm": 0, + "StaticProfiler::TotalLNCCommTransfer": 0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0, + "TilingProfiler::DmaInstructionsAfterTiling": 0, + "TilingProfiler::GenericInstructionsAfterTiling": 79, + "TilingProfiler::MatMultInstructionsAfterTiling": 231696, + "TilingProfiler::NumPfTransposes": 398, + "TilingProfiler::NumPfTransposesForIo": 37, + "TilingProfiler::NumPfTransposesForLocal": 216, + "TilingProfiler::NumPfTransposesForNonlocal": 145, + "TilingProfiler::PfTransposeInstructions": 19729, + "TilingProfiler::PfTransposeInstructionsForIo": 19296, + "TilingProfiler::PfTransposeInstructionsForLocal": 288, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 145, + "TilingProfiler::ReduceInstructionsAfterTiling": 74, + "TilingProfiler::SimdInstructionsAfterTiling": 3035, + "TilingProfiler::TotalInstructionsAfterTiling": 0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0, + "TransformConvOp::conv2d_column_packing": 0, + "TransformConvOp::conv2d_column_packing_1": 0, + "TransformConvOp::conv2d_column_packing_io10": 0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0 + } + } +} \ No newline at end of file diff --git a/token_generation_model/_tp0_bk1/graph.neff b/token_generation_model/_tp0_bk1/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..382ea79cb1de59673b9eebde39a0a7d7f0a2eaa7 --- /dev/null +++ b/token_generation_model/_tp0_bk1/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7766ebc9549de8407cdfbe1f261eb1990584c4f800bbdd332e5825276d7e8ba9 +size 6042624 diff --git a/token_generation_model/_tp0_bk1/log-neuron-cc.txt b/token_generation_model/_tp0_bk1/log-neuron-cc.txt new file mode 100644 index 0000000000000000000000000000000000000000..0da8e3636c8141f2b2573d58248b5b785fc5b60f --- /dev/null +++ b/token_generation_model/_tp0_bk1/log-neuron-cc.txt @@ -0,0 +1,2934 @@ +2025-08-07T13:53:51Z INFO 47983 [root]: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/bin/neuronx-cc compile --framework=XLA /home/ubuntu/qwen3/token_generation_model/_tp0_bk1/model.MODULE_d608453625db6ed38994+e5eecdd4.hlo_module.pb --output /home/ubuntu/qwen3/token_generation_model/_tp0_bk1/model.MODULE_d608453625db6ed38994+e5eecdd4.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma' --lnc=1 -O2 --internal-hlo2tensorizer-options=--verify-hlo=true --logfile=/home/ubuntu/qwen3/token_generation_model/_tp0_bk1/log-neuron-cc.txt --verbose=35 +2025-08-07T13:53:51Z INFO 47983 [root]: NeuronX Compiler version 2.20.9961.0+0acef03a Python version 3.10.12 HWM version 2.20.0.9961+0acef03a NumPy version 1.26.4 Running on AMI ami-040348201d80b58ad Running in region usw2-az4 +2025-08-07T13:53:51Z INFO 48500 [root]: XLA detected +2025-08-07T13:53:51Z INFO 48500 [root]: Pipeline: HLOToTensorizer Frontend StaticIOTranspose WalrusDriver BIRLinker Kelper NeffWrapper +2025-08-07T13:53:51Z INFO 48500 [root]: Intermediate files stored in /home/ubuntu/qwen3/token_generation_model/_tp0_bk1/neuronxcc-hdngl0fs, output in /home/ubuntu/qwen3/token_generation_model/_tp0_bk1 +2025-08-07T13:53:51Z INFO 48500 [pipeline.Pipeline.0]: Job Pipeline len(in_states) 1 +2025-08-07T13:53:51Z INFO 48500 [pipeline.Pipeline.0]: Processing input #0 +2025-08-07T13:53:51Z INFO 48500 [pipeline.Pipeline.0]: Running pipeline Pipeline.0 +2025-08-07T13:53:51Z INFO 48500 [pipeline.Pipeline.0]: Starting job job.HLOToTensorizer.0 +2025-08-07T13:53:51Z INFO 48500 [job.HLOToTensorizer.0]: Job HLOToTensorizer len(in_states) 1 +2025-08-07T13:53:51Z INFO 48500 [job.HLOToTensorizer.0]: Processing input #0 +2025-08-07T13:53:51Z INFO 48500 [job.HLOToTensorizer.0]: IR signature: 9c565989bf644de18fb8b4dbcf5ae03d0be2bfe8bc7c9308e7954d0a9db691fc for model.MODULE_d608453625db6ed38994+e5eecdd4.hlo_module.pb +2025-08-07T13:53:51Z INFO 48500 [job.HLOToTensorizer.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo2penguin --input /home/ubuntu/qwen3/token_generation_model/_tp0_bk1/model.MODULE_d608453625db6ed38994+e5eecdd4.hlo_module.pb --out-dir ./ --output penguin.py --remat --max-costly-ops=2 --max-live-in-size=5 --max-remat-chain-size=10 --max-mem-multiple=1.8 --min-def-use-distance=500 --remat-policy=transformer --allow-same-pass-remat=true --layers-per-module=1 --emit-tensor-level-dropout-ops --verify-hlo=true --native-to-custom-softmax --partitioner-opts='--transformer' +2025-08-07T13:53:51Z INFO 48500 [job.HLOToTensorizer.0]: DEBUG: needsModular? No. macCnt 3821944896 num non-trivial Ops 3786 +INFO: Switching to single-module compile. PrePartitionPipe skipped. +INFO: Found memory bound graph +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 2 +Replaced 0 dropout sequences with OffloadedDropout +INFO: HloMacCount has found 3821871104 +INFO: Traffic has found 8267155053 +INFO: AIF 0.924592 +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert cosine custom-call divide dot exponential gather get-tuple-element iota maximum multiply negate pad parameter reduce reshape rng scatter select sine slice subtract transpose tuple +Warning: Could not open file debug_info_hlo_partitions.json +2025-08-07 13:53:51.866764: W hilo/hlo2penguin/utils/DumpDebugInfo.cc:52] Truncating long HLO operator name %tuple.13231 = tuple(%reshape.5201, %scatter.12235, %scatter.12250, %scatter.12263, %scatter.12278, %scatter.12291, %scatter.12306, %scatter.12319, %scatter.12334, %scatter.12347, %scatter.12362, %scatter.12375, %scatter.12390, %scatter.12403, %scatter.12418, %scatter.12431, %scatter.12446, %scatter.12459, %scatter.12474, %scatter.12487, %scatter.12502, %scatter.12515, %scatter.12530, %scatter.12543, %scatter.12558, %scatter.12571, %scatter.12586, %scatter.12599, %scatter.12614, %scatter.12627, %scatter.... to 512 characters in the compiler's debug metadata +Invoking RemoveOptimizationBarriers pass + +2025-08-07T13:53:51Z INFO 48500 [job.HLOToTensorizer.0]: IR signature: 32c0fc7cc8460b1d68936a42f1e45012024216806b83e6a0eb626b7280488b7d for sg0000/HLOToTensorizer +2025-08-07T13:53:52Z INFO 48500 [job.HLOToTensorizer.0]: Job #0 finished +2025-08-07T13:53:52Z INFO 48500 [pipeline.Pipeline.0]: Finished job job.HLOToTensorizer.0 +2025-08-07T13:53:52Z INFO 48500 [pipeline.Pipeline.0]: Starting job job.Frontend.0 +2025-08-07T13:53:52Z INFO 48500 [job.Frontend.0]: Job Frontend len(in_states) 1 +2025-08-07T13:53:52Z INFO 48500 [job.Frontend.0]: Processing input #0 +2025-08-07T13:53:52Z INFO 48500 [job.Frontend.0]: Start model loading +2025-08-07T13:53:52Z INFO 48500 [job.Frontend.0]: Start tensorization +2025-08-07T13:53:52Z INFO 48500 [job.Frontend.0]: Num jobs: 1 +2025-08-07T13:53:52Z USER 48500 [root/Tensorizer/Tensorizer]: Running Tensorizer +2025-08-07T13:53:52Z INFO 48500 [Tensorizer]: Frontend did not find netlist info. Switching to flat flow. +2025-08-07T13:53:52Z INFO 48500 [Tensorizer]: Building model from Penguin script "penguin.py"... +2025-08-07T13:53:53Z INFO 48500 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=1 --num-neuroncores-per-sengine=1 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --keep-remat-dma-transpose --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-08-07T13:53:53Z INFO 48500 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:53:53Z INFO 48500 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 48500 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:53:53Z INFO 48500 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-08-07T13:53:53Z INFO 48500 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48500 [sg0000/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.040 seconds +2025-08-07T13:53:53Z INFO 48500 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-08-07T13:53:53Z INFO 48500 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48500 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.018 seconds +2025-08-07T13:53:53Z INFO 48500 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:53Z INFO 48500 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 48500 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.134 seconds +2025-08-07T13:53:53Z INFO 48500 [sg0000/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-08-07T13:53:53Z INFO 48500 [sg0000/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48500 [sg0000/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.077 seconds +2025-08-07T13:53:53Z INFO 48500 [sg0000/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-08-07T13:53:54Z INFO 48500 [sg0000/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 48500 [sg0000/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.523 seconds +2025-08-07T13:53:54Z INFO 48500 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:53:54Z INFO 48500 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:54Z INFO 48500 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 48500 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.005 seconds +2025-08-07T13:53:54Z INFO 48500 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:54Z INFO 48500 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 48500 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.384 seconds +2025-08-07T13:53:54Z INFO 48500 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.419 seconds +2025-08-07T13:53:54Z INFO 48500 [sg0000/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-08-07T13:53:55Z INFO 48500 [sg0000/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48500 [sg0000/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.472 seconds +2025-08-07T13:53:55Z INFO 48500 [sg0000/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-08-07T13:53:55Z INFO 48500 [sg0000/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48500 [sg0000/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.074 seconds +2025-08-07T13:53:55Z INFO 48500 [sg0000/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-08-07T13:53:55Z INFO 48500 [sg0000/Tensorizer/LegalizeCCOpLayout]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48500 [sg0000/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.085 seconds +2025-08-07T13:53:55Z INFO 48500 [sg0000/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-08-07T13:53:55Z INFO 48500 [sg0000/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48500 [sg0000/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.055 seconds +2025-08-07T13:53:55Z INFO 48500 [sg0000/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-08-07T13:53:55Z INFO 48500 [sg0000/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48500 [sg0000/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.056 seconds +2025-08-07T13:53:55Z INFO 48500 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:53:55Z INFO 48500 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48500 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.198 seconds +2025-08-07T13:53:55Z INFO 48500 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:53:55Z INFO 48500 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48500 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.059 seconds +2025-08-07T13:53:55Z INFO 48500 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:56Z INFO 48500 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48500 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.800 seconds +2025-08-07T13:53:56Z INFO 48500 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:56Z INFO 48500 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48500 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.056 seconds +2025-08-07T13:53:56Z INFO 48500 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:56Z INFO 48500 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48500 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.059 seconds +2025-08-07T13:53:56Z INFO 48500 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:53:56Z INFO 48500 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48500 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.058 seconds +2025-08-07T13:53:56Z INFO 48500 [sg0000/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-08-07T13:53:56Z INFO 48500 [sg0000/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48500 [sg0000/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.070 seconds +2025-08-07T13:53:56Z INFO 48500 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:57Z INFO 48500 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48500 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.058 seconds +2025-08-07T13:53:57Z INFO 48500 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:53:57Z INFO 48500 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48500 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.185 seconds +2025-08-07T13:53:57Z INFO 48500 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:57Z INFO 48500 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48500 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.059 seconds +2025-08-07T13:53:57Z INFO 48500 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-08-07T13:53:58Z INFO 48500 [sg0000/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 48500 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 1.604 seconds +2025-08-07T13:53:58Z INFO 48500 [sg0000/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-08-07T13:53:59Z INFO 48500 [sg0000/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 48500 [sg0000/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.393 seconds +2025-08-07T13:53:59Z INFO 48500 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:53:59Z INFO 48500 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:59Z INFO 48500 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 48500 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.006 seconds +2025-08-07T13:53:59Z INFO 48500 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:59Z INFO 48500 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 48500 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.441 seconds +2025-08-07T13:53:59Z INFO 48500 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.467 seconds +2025-08-07T13:53:59Z INFO 48500 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-08-07T13:54:04Z INFO 48500 [sg0000/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-08-07T13:54:04Z INFO 48500 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 5.041 seconds +2025-08-07T13:54:04Z INFO 48500 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:54:06Z INFO 48500 [sg0000/Tensorizer/LoopFusion]: Finished (changed=True) +2025-08-07T13:54:06Z INFO 48500 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 2.096 seconds +2025-08-07T13:54:06Z INFO 48500 [sg0000/Tensorizer/Rematerialization]: Running Rematerialization +2025-08-07T13:54:07Z INFO 48500 [sg0000/Tensorizer/Rematerialization]: Finished (changed=True) +2025-08-07T13:54:07Z INFO 48500 [sg0000/Tensorizer/Rematerialization]: Rematerialization finished after 0.169 seconds +2025-08-07T13:54:07Z INFO 48500 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:54:07Z INFO 48500 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:54:07Z INFO 48500 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.407 seconds +2025-08-07T13:54:07Z INFO 48500 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:54:08Z INFO 48500 [sg0000/Tensorizer/Delinearization]: Finished (changed=True) +2025-08-07T13:54:08Z INFO 48500 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.520 seconds +2025-08-07T13:54:08Z INFO 48500 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:54:09Z INFO 48500 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=True) +2025-08-07T13:54:09Z INFO 48500 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 1.521 seconds +2025-08-07T13:54:09Z INFO 48500 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:54:09Z INFO 48500 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:54:09Z INFO 48500 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.134 seconds +2025-08-07T13:54:09Z INFO 48500 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:54:09Z INFO 48500 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:54:09Z INFO 48500 [sg0000/Tensorizer/LICM]: LICM finished after 0.088 seconds +2025-08-07T13:54:09Z INFO 48500 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:54:09Z INFO 48500 [sg0000/Tensorizer/Delinearization]: Finished (changed=True) +2025-08-07T13:54:09Z INFO 48500 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.157 seconds +2025-08-07T13:54:09Z INFO 48500 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:54:10Z INFO 48500 [sg0000/Tensorizer/LoopFusion]: Finished (changed=True) +2025-08-07T13:54:10Z INFO 48500 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.546 seconds +2025-08-07T13:54:10Z INFO 48500 [sg0000/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-08-07T13:54:10Z INFO 48500 [sg0000/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-08-07T13:54:10Z INFO 48500 [sg0000/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.037 seconds +2025-08-07T13:54:10Z INFO 48500 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:54:10Z INFO 48500 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:54:10Z INFO 48500 [sg0000/Tensorizer/LICM]: LICM finished after 0.067 seconds +2025-08-07T13:54:10Z INFO 48500 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:54:10Z INFO 48500 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:54:10Z INFO 48500 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.257 seconds +2025-08-07T13:54:10Z INFO 48500 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:54:11Z INFO 48500 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=True) +2025-08-07T13:54:11Z INFO 48500 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.140 seconds +2025-08-07T13:54:11Z INFO 48500 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:54:11Z INFO 48500 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:54:11Z INFO 48500 [sg0000/Tensorizer/LICM]: LICM finished after 0.064 seconds +2025-08-07T13:54:11Z INFO 48500 [sg0000/Tensorizer/PadElimination]: Running PadElimination +2025-08-07T13:54:11Z INFO 48500 [sg0000/Tensorizer/PadElimination]: Finished (changed=False) +2025-08-07T13:54:11Z INFO 48500 [sg0000/Tensorizer/PadElimination]: PadElimination finished after 0.009 seconds +2025-08-07T13:54:11Z INFO 48500 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:54:11Z INFO 48500 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:54:11Z INFO 48500 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.152 seconds +2025-08-07T13:54:11Z INFO 48500 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:54:11Z INFO 48500 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-08-07T13:54:11Z INFO 48500 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.338 seconds +2025-08-07T13:54:11Z INFO 48500 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:54:11Z INFO 48500 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:54:11Z INFO 48500 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.036 seconds +2025-08-07T13:54:11Z INFO 48500 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:54:11Z INFO 48500 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:54:11Z INFO 48500 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.127 seconds +2025-08-07T13:54:11Z INFO 48500 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:54:11Z INFO 48500 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:54:11Z INFO 48500 [sg0000/Tensorizer/LICM]: LICM finished after 0.067 seconds +2025-08-07T13:54:11Z INFO 48500 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:54:12Z INFO 48500 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-08-07T13:54:12Z INFO 48500 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.110 seconds +2025-08-07T13:54:12Z INFO 48500 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:54:12Z INFO 48500 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:54:12Z INFO 48500 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.039 seconds +2025-08-07T13:54:12Z INFO 48500 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:54:12Z INFO 48500 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:54:12Z INFO 48500 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.037 seconds +2025-08-07T13:54:12Z INFO 48500 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-08-07T13:54:12Z INFO 48500 [sg0000/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-08-07T13:54:12Z INFO 48500 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.213 seconds +2025-08-07T13:54:12Z INFO 48500 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:54:12Z INFO 48500 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-08-07T13:54:12Z INFO 48500 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.187 seconds +2025-08-07T13:54:12Z INFO 48500 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:54:12Z INFO 48500 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-08-07T13:54:12Z INFO 48500 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.414 seconds +2025-08-07T13:54:12Z INFO 48500 [sg0000/Tensorizer/Recompute]: Running Recompute +2025-08-07T13:54:12Z INFO 48500 [sg0000/Tensorizer/Recompute]: Finished (changed=False) +2025-08-07T13:54:12Z INFO 48500 [sg0000/Tensorizer/Recompute]: Recompute finished after 0.010 seconds +2025-08-07T13:54:12Z INFO 48500 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:54:13Z INFO 48500 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:54:13Z INFO 48500 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.046 seconds +2025-08-07T13:54:13Z INFO 48500 [Tensorizer]: After optimization: 1185 statements +2025-08-07T13:54:13Z INFO 48500 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:54:13Z INFO 48500 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:54:13Z INFO 48500 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:54:13Z INFO 48500 [sg0000/Tensorizer/MutateDataType]: Running MutateDataType +2025-08-07T13:54:13Z INFO 48500 [sg0000/Tensorizer/MutateDataType]: Finished (changed=False) +2025-08-07T13:54:13Z INFO 48500 [sg0000/Tensorizer/MutateDataType]: MutateDataType finished after 0.058 seconds +2025-08-07T13:54:13Z INFO 48500 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:54:13Z INFO 48500 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:54:13Z INFO 48500 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.037 seconds +2025-08-07T13:54:13Z INFO 48500 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:54:13Z INFO 48500 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:54:13Z INFO 48500 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.135 seconds +2025-08-07T13:54:13Z INFO 48500 [sg0000/Tensorizer/TileCCOps]: Running TileCCOps +2025-08-07T13:54:13Z INFO 48500 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=8192 is not above min_allgather_tile_size_in_bytes=8388608` +2025-08-07T13:54:13Z INFO 48500 [sg0000/Tensorizer/TileCCOps]: in bfloat16 (4096,) %'all_gather.1' = AllGatherOp-502 AllGather_add(bfloat16 (2048,) %'gather.1', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((4096,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.47 | hlo_id: 47 | , id = 502 +2025-08-07T13:54:13Z INFO 48500 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `All gather output tensor check failed` +2025-08-07T13:54:13Z INFO 48500 [sg0000/Tensorizer/TileCCOps]: in float32 (512,) %'all_gather.2' = AllGatherOp-10901 AllGather_add(float32 (256,) %'add.217', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.12066 | hlo_id: 12066 | , id = 10901 +2025-08-07T13:54:13Z INFO 48500 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=2048 is not above min_allgather_tile_size_in_bytes=8388608` +2025-08-07T13:54:13Z INFO 48500 [sg0000/Tensorizer/TileCCOps]: in uint32 (512,) %'all_gather.3' = AllGatherOp-10917 AllGather_add(uint32 (256,) %'add.218', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.12201 | hlo_id: 12201 | , id = 10917 +2025-08-07T13:54:13Z INFO 48500 [sg0000/Tensorizer/TileCCOps]: Finished (changed=False) +2025-08-07T13:54:13Z INFO 48500 [sg0000/Tensorizer/TileCCOps]: TileCCOps finished after 0.235 seconds +2025-08-07T13:54:13Z INFO 48500 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:54:14Z INFO 48500 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-08-07T13:54:14Z INFO 48500 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.504 seconds +2025-08-07T13:54:14Z INFO 48500 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:54:14Z INFO 48500 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:54:14Z INFO 48500 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.147 seconds +2025-08-07T13:54:14Z INFO 48500 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:54:14Z INFO 48500 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:54:14Z INFO 48500 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.338 seconds +2025-08-07T13:54:14Z INFO 48500 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:54:14Z INFO 48500 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:54:14Z INFO 48500 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.040 seconds +2025-08-07T13:54:14Z INFO 48500 [sg0000/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-08-07T13:54:14Z INFO 48500 [sg0000/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-08-07T13:54:14Z INFO 48500 [sg0000/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.048 seconds +2025-08-07T13:54:14Z INFO 48500 [sg0000/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-08-07T13:54:15Z INFO 48500 [sg0000/Tensorizer/InferIntrinsicOnCC]: Finished (changed=True) +2025-08-07T13:54:15Z INFO 48500 [sg0000/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.487 seconds +2025-08-07T13:54:15Z INFO 48500 [sg0000/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-08-07T13:54:15Z INFO 48500 [sg0000/Tensorizer/ResolveAccessConflict]: Finished (changed=True) +2025-08-07T13:54:15Z INFO 48500 [sg0000/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.269 seconds +2025-08-07T13:54:15Z INFO 48500 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:54:15Z INFO 48500 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:54:15Z INFO 48500 [sg0000/Tensorizer/LICM]: LICM finished after 0.076 seconds +2025-08-07T13:54:15Z INFO 48500 [sg0000/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-08-07T13:54:15Z INFO 48500 [sg0000/Tensorizer/LocalLayoutOpt]: Finished (changed=False) +2025-08-07T13:54:15Z INFO 48500 [sg0000/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.411 seconds +2025-08-07T13:54:15Z INFO 48500 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:54:16Z INFO 48500 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:54:16Z INFO 48500 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.356 seconds +2025-08-07T13:54:16Z INFO 48500 [sg0000/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-08-07T13:54:16Z INFO 48500 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:54:16Z INFO 48500 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-08-07T13:54:16Z INFO 48500 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.216 seconds +2025-08-07T13:54:16Z INFO 48500 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-08-07T13:54:16Z INFO 48500 [sg0000/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-08-07T13:54:16Z INFO 48500 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:54:17Z INFO 48500 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:54:17Z INFO 48500 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.243 seconds +2025-08-07T13:54:17Z INFO 48500 [sg0000/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-08-07T13:54:17Z INFO 48500 [sg0000/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 1.127 seconds +2025-08-07T13:54:17Z INFO 48500 [sg0000/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-08-07T13:54:18Z INFO 48500 [sg0000/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.372 seconds +2025-08-07T13:54:18Z INFO 48500 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 1.512 seconds +2025-08-07T13:54:18Z INFO 48500 [sg0000/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-08-07T13:54:18Z INFO 48500 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:54:19Z INFO 48500 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:54:22Z INFO 48500 [sg0000/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-08-07T13:54:22Z INFO 48500 [sg0000/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 4.488 seconds +2025-08-07T13:54:22Z INFO 48500 [sg0000/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-08-07T13:54:22Z INFO 48500 [sg0000/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-08-07T13:54:22Z INFO 48500 [sg0000/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-08-07T13:54:51Z INFO 48500 [sg0000/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-08-07T13:54:51Z INFO 48500 [sg0000/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 29.230 seconds +2025-08-07T13:54:51Z INFO 48500 [sg0000/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-08-07T13:54:52Z INFO 48500 [sg0000/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-08-07T13:54:52Z INFO 48500 [sg0000/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 1.114 seconds +2025-08-07T13:54:52Z INFO 48500 [sg0000/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 30.353 seconds +2025-08-07T13:54:52Z INFO 48500 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:54:53Z INFO 48500 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-08-07T13:54:53Z INFO 48500 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.155 seconds +2025-08-07T13:54:53Z INFO 48500 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-08-07T13:54:53Z INFO 48500 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-08-07T13:54:53Z INFO 48500 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.229 seconds +2025-08-07T13:54:53Z INFO 48500 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:54:53Z INFO 48500 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-08-07T13:54:53Z INFO 48500 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.248 seconds +2025-08-07T13:54:53Z INFO 48500 [sg0000/Tensorizer/PGTiling]: Running PGTiling +2025-08-07T13:54:53Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11158 of IO tensor {'CrossPassTensor': ''}bfloat16 %input4|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(1, 'AG2839'), (260, 'AG2833'), (152, 'AG2837')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11435 of IO tensor {'CrossPassTensor': ''}bfloat16 %input6|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(3, 'AG2853'), (260, 'AG2833'), (155, 'AG2851')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11686 of IO tensor {'CrossPassTensor': ''}bfloat16 %input8|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(5, 'AG2865'), (260, 'AG2833'), (158, 'AG2863')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11937 of IO tensor {'CrossPassTensor': ''}bfloat16 %input10|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(7, 'AG2877'), (260, 'AG2833'), (161, 'AG2875')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12188 of IO tensor {'CrossPassTensor': ''}bfloat16 %input12|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(9, 'AG2889'), (260, 'AG2833'), (164, 'AG2887')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12439 of IO tensor {'CrossPassTensor': ''}bfloat16 %input14|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(11, 'AG2901'), (260, 'AG2833'), (167, 'AG2899')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12690 of IO tensor {'CrossPassTensor': ''}bfloat16 %input16|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(13, 'AG2913'), (260, 'AG2833'), (170, 'AG2911')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12941 of IO tensor {'CrossPassTensor': ''}bfloat16 %input18|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(15, 'AG2925'), (260, 'AG2833'), (173, 'AG2923')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13192 of IO tensor {'CrossPassTensor': ''}bfloat16 %input20|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(17, 'AG2937'), (260, 'AG2833'), (176, 'AG2935')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13443 of IO tensor {'CrossPassTensor': ''}bfloat16 %input22|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(19, 'AG2949'), (260, 'AG2833'), (179, 'AG2947')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13694 of IO tensor {'CrossPassTensor': ''}bfloat16 %input24|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(21, 'AG2961'), (260, 'AG2833'), (182, 'AG2959')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13945 of IO tensor {'CrossPassTensor': ''}bfloat16 %input26|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(23, 'AG2973'), (260, 'AG2833'), (185, 'AG2971')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14196 of IO tensor {'CrossPassTensor': ''}bfloat16 %input28|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(25, 'AG2985'), (260, 'AG2833'), (188, 'AG2983')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14447 of IO tensor {'CrossPassTensor': ''}bfloat16 %input30|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(27, 'AG2997'), (260, 'AG2833'), (191, 'AG2995')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14698 of IO tensor {'CrossPassTensor': ''}bfloat16 %input32|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(29, 'AG3009'), (260, 'AG2833'), (194, 'AG3007')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14949 of IO tensor {'CrossPassTensor': ''}bfloat16 %input34|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(31, 'AG3021'), (260, 'AG2833'), (197, 'AG3019')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15200 of IO tensor {'CrossPassTensor': ''}bfloat16 %input36|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(33, 'AG3033'), (260, 'AG2833'), (200, 'AG3031')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15451 of IO tensor {'CrossPassTensor': ''}bfloat16 %input38|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(35, 'AG3045'), (260, 'AG2833'), (203, 'AG3043')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15702 of IO tensor {'CrossPassTensor': ''}bfloat16 %input40|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(37, 'AG3057'), (260, 'AG2833'), (206, 'AG3055')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15953 of IO tensor {'CrossPassTensor': ''}bfloat16 %input42|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(39, 'AG3069'), (260, 'AG2833'), (209, 'AG3067')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16204 of IO tensor {'CrossPassTensor': ''}bfloat16 %input44|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(41, 'AG3081'), (260, 'AG2833'), (212, 'AG3079')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16455 of IO tensor {'CrossPassTensor': ''}bfloat16 %input46|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(43, 'AG3093'), (260, 'AG2833'), (215, 'AG3091')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16706 of IO tensor {'CrossPassTensor': ''}bfloat16 %input48|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(45, 'AG3105'), (260, 'AG2833'), (218, 'AG3103')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16957 of IO tensor {'CrossPassTensor': ''}bfloat16 %input50|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(47, 'AG3117'), (260, 'AG2833'), (221, 'AG3115')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 17208 of IO tensor {'CrossPassTensor': ''}bfloat16 %input52|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(49, 'AG3129'), (260, 'AG2833'), (224, 'AG3127')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 17459 of IO tensor {'CrossPassTensor': ''}bfloat16 %input54|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(51, 'AG3141'), (260, 'AG2833'), (227, 'AG3139')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 17710 of IO tensor {'CrossPassTensor': ''}bfloat16 %input56|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(53, 'AG3153'), (260, 'AG2833'), (230, 'AG3151')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 17961 of IO tensor {'CrossPassTensor': ''}bfloat16 %input58|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(55, 'AG3165'), (260, 'AG2833'), (233, 'AG3163')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 18212 of IO tensor {'CrossPassTensor': ''}bfloat16 %input60|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(57, 'AG3177'), (260, 'AG2833'), (236, 'AG3175')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 18463 of IO tensor {'CrossPassTensor': ''}bfloat16 %input62|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(59, 'AG3189'), (260, 'AG2833'), (239, 'AG3187')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 18714 of IO tensor {'CrossPassTensor': ''}bfloat16 %input64|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(61, 'AG3201'), (260, 'AG2833'), (242, 'AG3199')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 18965 of IO tensor {'CrossPassTensor': ''}bfloat16 %input66|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(63, 'AG3213'), (260, 'AG2833'), (245, 'AG3211')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 19216 of IO tensor {'CrossPassTensor': ''}bfloat16 %input68|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(65, 'AG3225'), (260, 'AG2833'), (248, 'AG3223')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 19467 of IO tensor {'CrossPassTensor': ''}bfloat16 %input70|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(67, 'AG3237'), (260, 'AG2833'), (251, 'AG3235')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 19718 of IO tensor {'CrossPassTensor': ''}bfloat16 %input72|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(69, 'AG3249'), (260, 'AG2833'), (254, 'AG3247')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 19969 of IO tensor {'CrossPassTensor': ''}bfloat16 %input74|NHWC|(1, 4, 4, 256, 2, 64) is not sorted, index list (w/ AG ids): [(71, 'AG3261'), (260, 'AG2833'), (257, 'AG3259')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11309 of IO tensor {'CrossPassTensor': ''}bfloat16 %input77(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(153, 'AG2843'), (1, 'AG2839'), (80, 'AG2838'), (264, 'AG2842'), (409, 'AG2841')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28661 of IO tensor {'CrossPassTensor': ''}bfloat16 %input78|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(408, 'AG2834'), (261, 'AG2835')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28652 of IO tensor {'CrossPassTensor': ''}bfloat16 %input79|NC|(128, 32) is not sorted, index list (w/ AG ids): [(408, 'AG2834'), (261, 'AG2835')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28658 of IO tensor {'CrossPassTensor': ''}bfloat16 %input81(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(408, 'AG2834'), (261, 'AG2835')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28653 of IO tensor {'CrossPassTensor': ''}bfloat16 %input83(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(408, 'AG2834'), (261, 'AG2835')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28666 of IO tensor {'CrossPassTensor': ''}bfloat16 %input84(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(81, 'AG2848'), (266, 'AG2846'), (154, 'AG2847')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28665 of IO tensor {'CrossPassTensor': ''}bfloat16 %input85|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28663 of IO tensor {'CrossPassTensor': ''}bfloat16 %input86|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28664 of IO tensor {'CrossPassTensor': ''}bfloat16 %input87|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11582 of IO tensor {'CrossPassTensor': ''}bfloat16 %input88(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(156, 'AG2857'), (3, 'AG2853'), (82, 'AG2852'), (269, 'AG2856'), (411, 'AG2855')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28674 of IO tensor {'CrossPassTensor': ''}bfloat16 %input89|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28667 of IO tensor {'CrossPassTensor': ''}bfloat16 %input90|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28671 of IO tensor {'CrossPassTensor': ''}bfloat16 %input92(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28668 of IO tensor {'CrossPassTensor': ''}bfloat16 %input94(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28679 of IO tensor {'CrossPassTensor': ''}bfloat16 %input95(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(83, 'AG2860'), (270, 'AG2858'), (157, 'AG2859')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28678 of IO tensor {'CrossPassTensor': ''}bfloat16 %input96|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28676 of IO tensor {'CrossPassTensor': ''}bfloat16 %input97|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28677 of IO tensor {'CrossPassTensor': ''}bfloat16 %input98|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11833 of IO tensor {'CrossPassTensor': ''}bfloat16 %input99(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(159, 'AG2869'), (5, 'AG2865'), (84, 'AG2864'), (273, 'AG2868'), (412, 'AG2867')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28687 of IO tensor {'CrossPassTensor': ''}bfloat16 %input100|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28680 of IO tensor {'CrossPassTensor': ''}bfloat16 %input101|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28684 of IO tensor {'CrossPassTensor': ''}bfloat16 %input103(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28681 of IO tensor {'CrossPassTensor': ''}bfloat16 %input105(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28692 of IO tensor {'CrossPassTensor': ''}bfloat16 %input106(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(85, 'AG2872'), (274, 'AG2870'), (160, 'AG2871')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28691 of IO tensor {'CrossPassTensor': ''}bfloat16 %input107|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28689 of IO tensor {'CrossPassTensor': ''}bfloat16 %input108|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28690 of IO tensor {'CrossPassTensor': ''}bfloat16 %input109|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12084 of IO tensor {'CrossPassTensor': ''}bfloat16 %input110(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(162, 'AG2881'), (7, 'AG2877'), (86, 'AG2876'), (277, 'AG2880'), (413, 'AG2879')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28700 of IO tensor {'CrossPassTensor': ''}bfloat16 %input111|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28693 of IO tensor {'CrossPassTensor': ''}bfloat16 %input112|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28697 of IO tensor {'CrossPassTensor': ''}bfloat16 %input114(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28694 of IO tensor {'CrossPassTensor': ''}bfloat16 %input116(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28705 of IO tensor {'CrossPassTensor': ''}bfloat16 %input117(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(87, 'AG2884'), (278, 'AG2882'), (163, 'AG2883')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28704 of IO tensor {'CrossPassTensor': ''}bfloat16 %input118|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28702 of IO tensor {'CrossPassTensor': ''}bfloat16 %input119|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28703 of IO tensor {'CrossPassTensor': ''}bfloat16 %input120|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12335 of IO tensor {'CrossPassTensor': ''}bfloat16 %input121(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(165, 'AG2893'), (9, 'AG2889'), (88, 'AG2888'), (281, 'AG2892'), (414, 'AG2891')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28713 of IO tensor {'CrossPassTensor': ''}bfloat16 %input122|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28706 of IO tensor {'CrossPassTensor': ''}bfloat16 %input123|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28710 of IO tensor {'CrossPassTensor': ''}bfloat16 %input125(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28707 of IO tensor {'CrossPassTensor': ''}bfloat16 %input127(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28718 of IO tensor {'CrossPassTensor': ''}bfloat16 %input128(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(89, 'AG2896'), (282, 'AG2894'), (166, 'AG2895')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28717 of IO tensor {'CrossPassTensor': ''}bfloat16 %input129|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28715 of IO tensor {'CrossPassTensor': ''}bfloat16 %input130|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28716 of IO tensor {'CrossPassTensor': ''}bfloat16 %input131|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12586 of IO tensor {'CrossPassTensor': ''}bfloat16 %input132(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(168, 'AG2905'), (11, 'AG2901'), (90, 'AG2900'), (285, 'AG2904'), (415, 'AG2903')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28726 of IO tensor {'CrossPassTensor': ''}bfloat16 %input133|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28719 of IO tensor {'CrossPassTensor': ''}bfloat16 %input134|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28723 of IO tensor {'CrossPassTensor': ''}bfloat16 %input136(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28720 of IO tensor {'CrossPassTensor': ''}bfloat16 %input138(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28731 of IO tensor {'CrossPassTensor': ''}bfloat16 %input139(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(91, 'AG2908'), (286, 'AG2906'), (169, 'AG2907')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28730 of IO tensor {'CrossPassTensor': ''}bfloat16 %input140|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28728 of IO tensor {'CrossPassTensor': ''}bfloat16 %input141|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28729 of IO tensor {'CrossPassTensor': ''}bfloat16 %input142|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12837 of IO tensor {'CrossPassTensor': ''}bfloat16 %input143(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(171, 'AG2917'), (13, 'AG2913'), (92, 'AG2912'), (289, 'AG2916'), (416, 'AG2915')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28739 of IO tensor {'CrossPassTensor': ''}bfloat16 %input144|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28732 of IO tensor {'CrossPassTensor': ''}bfloat16 %input145|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28736 of IO tensor {'CrossPassTensor': ''}bfloat16 %input147(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28733 of IO tensor {'CrossPassTensor': ''}bfloat16 %input149(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28744 of IO tensor {'CrossPassTensor': ''}bfloat16 %input150(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(93, 'AG2920'), (290, 'AG2918'), (172, 'AG2919')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28743 of IO tensor {'CrossPassTensor': ''}bfloat16 %input151|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28741 of IO tensor {'CrossPassTensor': ''}bfloat16 %input152|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28742 of IO tensor {'CrossPassTensor': ''}bfloat16 %input153|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13088 of IO tensor {'CrossPassTensor': ''}bfloat16 %input154(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(174, 'AG2929'), (15, 'AG2925'), (94, 'AG2924'), (293, 'AG2928'), (417, 'AG2927')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28752 of IO tensor {'CrossPassTensor': ''}bfloat16 %input155|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28745 of IO tensor {'CrossPassTensor': ''}bfloat16 %input156|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28749 of IO tensor {'CrossPassTensor': ''}bfloat16 %input158(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28746 of IO tensor {'CrossPassTensor': ''}bfloat16 %input160(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28757 of IO tensor {'CrossPassTensor': ''}bfloat16 %input161(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(95, 'AG2932'), (294, 'AG2930'), (175, 'AG2931')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28756 of IO tensor {'CrossPassTensor': ''}bfloat16 %input162|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28754 of IO tensor {'CrossPassTensor': ''}bfloat16 %input163|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28755 of IO tensor {'CrossPassTensor': ''}bfloat16 %input164|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13339 of IO tensor {'CrossPassTensor': ''}bfloat16 %input165(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(177, 'AG2941'), (17, 'AG2937'), (96, 'AG2936'), (297, 'AG2940'), (418, 'AG2939')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28765 of IO tensor {'CrossPassTensor': ''}bfloat16 %input166|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28758 of IO tensor {'CrossPassTensor': ''}bfloat16 %input167|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28762 of IO tensor {'CrossPassTensor': ''}bfloat16 %input169(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28759 of IO tensor {'CrossPassTensor': ''}bfloat16 %input171(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28770 of IO tensor {'CrossPassTensor': ''}bfloat16 %input172(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(97, 'AG2944'), (298, 'AG2942'), (178, 'AG2943')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28769 of IO tensor {'CrossPassTensor': ''}bfloat16 %input173|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28767 of IO tensor {'CrossPassTensor': ''}bfloat16 %input174|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28768 of IO tensor {'CrossPassTensor': ''}bfloat16 %input175|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13590 of IO tensor {'CrossPassTensor': ''}bfloat16 %input176(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(180, 'AG2953'), (19, 'AG2949'), (98, 'AG2948'), (301, 'AG2952'), (419, 'AG2951')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28778 of IO tensor {'CrossPassTensor': ''}bfloat16 %input177|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28771 of IO tensor {'CrossPassTensor': ''}bfloat16 %input178|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28775 of IO tensor {'CrossPassTensor': ''}bfloat16 %input180(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28772 of IO tensor {'CrossPassTensor': ''}bfloat16 %input182(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28783 of IO tensor {'CrossPassTensor': ''}bfloat16 %input183(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(99, 'AG2956'), (302, 'AG2954'), (181, 'AG2955')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28782 of IO tensor {'CrossPassTensor': ''}bfloat16 %input184|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28780 of IO tensor {'CrossPassTensor': ''}bfloat16 %input185|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28781 of IO tensor {'CrossPassTensor': ''}bfloat16 %input186|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13841 of IO tensor {'CrossPassTensor': ''}bfloat16 %input187(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(183, 'AG2965'), (21, 'AG2961'), (100, 'AG2960'), (305, 'AG2964'), (420, 'AG2963')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28791 of IO tensor {'CrossPassTensor': ''}bfloat16 %input188|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28784 of IO tensor {'CrossPassTensor': ''}bfloat16 %input189|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28788 of IO tensor {'CrossPassTensor': ''}bfloat16 %input191(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28785 of IO tensor {'CrossPassTensor': ''}bfloat16 %input193(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28796 of IO tensor {'CrossPassTensor': ''}bfloat16 %input194(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(101, 'AG2968'), (306, 'AG2966'), (184, 'AG2967')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28795 of IO tensor {'CrossPassTensor': ''}bfloat16 %input195|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28793 of IO tensor {'CrossPassTensor': ''}bfloat16 %input196|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28794 of IO tensor {'CrossPassTensor': ''}bfloat16 %input197|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14092 of IO tensor {'CrossPassTensor': ''}bfloat16 %input198(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(186, 'AG2977'), (23, 'AG2973'), (102, 'AG2972'), (309, 'AG2976'), (421, 'AG2975')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28804 of IO tensor {'CrossPassTensor': ''}bfloat16 %input199|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28797 of IO tensor {'CrossPassTensor': ''}bfloat16 %input200|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28801 of IO tensor {'CrossPassTensor': ''}bfloat16 %input202(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28798 of IO tensor {'CrossPassTensor': ''}bfloat16 %input204(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28809 of IO tensor {'CrossPassTensor': ''}bfloat16 %input205(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(103, 'AG2980'), (310, 'AG2978'), (187, 'AG2979')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28808 of IO tensor {'CrossPassTensor': ''}bfloat16 %input206|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28806 of IO tensor {'CrossPassTensor': ''}bfloat16 %input207|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28807 of IO tensor {'CrossPassTensor': ''}bfloat16 %input208|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14343 of IO tensor {'CrossPassTensor': ''}bfloat16 %input209(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(189, 'AG2989'), (25, 'AG2985'), (104, 'AG2984'), (313, 'AG2988'), (422, 'AG2987')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28817 of IO tensor {'CrossPassTensor': ''}bfloat16 %input210|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28810 of IO tensor {'CrossPassTensor': ''}bfloat16 %input211|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28814 of IO tensor {'CrossPassTensor': ''}bfloat16 %input213(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28811 of IO tensor {'CrossPassTensor': ''}bfloat16 %input215(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28822 of IO tensor {'CrossPassTensor': ''}bfloat16 %input216(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(105, 'AG2992'), (314, 'AG2990'), (190, 'AG2991')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28821 of IO tensor {'CrossPassTensor': ''}bfloat16 %input217|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28819 of IO tensor {'CrossPassTensor': ''}bfloat16 %input218|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28820 of IO tensor {'CrossPassTensor': ''}bfloat16 %input219|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14594 of IO tensor {'CrossPassTensor': ''}bfloat16 %input220(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(192, 'AG3001'), (27, 'AG2997'), (106, 'AG2996'), (317, 'AG3000'), (423, 'AG2999')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28830 of IO tensor {'CrossPassTensor': ''}bfloat16 %input221|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28823 of IO tensor {'CrossPassTensor': ''}bfloat16 %input222|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28827 of IO tensor {'CrossPassTensor': ''}bfloat16 %input224(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28824 of IO tensor {'CrossPassTensor': ''}bfloat16 %input226(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28835 of IO tensor {'CrossPassTensor': ''}bfloat16 %input227(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(107, 'AG3004'), (318, 'AG3002'), (193, 'AG3003')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28834 of IO tensor {'CrossPassTensor': ''}bfloat16 %input228|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28832 of IO tensor {'CrossPassTensor': ''}bfloat16 %input229|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28833 of IO tensor {'CrossPassTensor': ''}bfloat16 %input230|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14845 of IO tensor {'CrossPassTensor': ''}bfloat16 %input231(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(195, 'AG3013'), (29, 'AG3009'), (108, 'AG3008'), (321, 'AG3012'), (424, 'AG3011')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28843 of IO tensor {'CrossPassTensor': ''}bfloat16 %input232|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28836 of IO tensor {'CrossPassTensor': ''}bfloat16 %input233|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28840 of IO tensor {'CrossPassTensor': ''}bfloat16 %input235(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28837 of IO tensor {'CrossPassTensor': ''}bfloat16 %input237(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28848 of IO tensor {'CrossPassTensor': ''}bfloat16 %input238(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(109, 'AG3016'), (322, 'AG3014'), (196, 'AG3015')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28847 of IO tensor {'CrossPassTensor': ''}bfloat16 %input239|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28845 of IO tensor {'CrossPassTensor': ''}bfloat16 %input240|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28846 of IO tensor {'CrossPassTensor': ''}bfloat16 %input241|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15096 of IO tensor {'CrossPassTensor': ''}bfloat16 %input242(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(198, 'AG3025'), (31, 'AG3021'), (110, 'AG3020'), (325, 'AG3024'), (425, 'AG3023')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28856 of IO tensor {'CrossPassTensor': ''}bfloat16 %input243|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28849 of IO tensor {'CrossPassTensor': ''}bfloat16 %input244|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28853 of IO tensor {'CrossPassTensor': ''}bfloat16 %input246(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28850 of IO tensor {'CrossPassTensor': ''}bfloat16 %input248(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28861 of IO tensor {'CrossPassTensor': ''}bfloat16 %input249(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(111, 'AG3028'), (326, 'AG3026'), (199, 'AG3027')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28860 of IO tensor {'CrossPassTensor': ''}bfloat16 %input250|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28858 of IO tensor {'CrossPassTensor': ''}bfloat16 %input251|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28859 of IO tensor {'CrossPassTensor': ''}bfloat16 %input252|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15347 of IO tensor {'CrossPassTensor': ''}bfloat16 %input253(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(201, 'AG3037'), (33, 'AG3033'), (112, 'AG3032'), (329, 'AG3036'), (426, 'AG3035')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28869 of IO tensor {'CrossPassTensor': ''}bfloat16 %input254|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28862 of IO tensor {'CrossPassTensor': ''}bfloat16 %input255|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28866 of IO tensor {'CrossPassTensor': ''}bfloat16 %input257(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28863 of IO tensor {'CrossPassTensor': ''}bfloat16 %input259(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28874 of IO tensor {'CrossPassTensor': ''}bfloat16 %input260(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(113, 'AG3040'), (330, 'AG3038'), (202, 'AG3039')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28873 of IO tensor {'CrossPassTensor': ''}bfloat16 %input261|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28871 of IO tensor {'CrossPassTensor': ''}bfloat16 %input262|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28872 of IO tensor {'CrossPassTensor': ''}bfloat16 %input263|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15598 of IO tensor {'CrossPassTensor': ''}bfloat16 %input264(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(204, 'AG3049'), (35, 'AG3045'), (114, 'AG3044'), (333, 'AG3048'), (427, 'AG3047')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28882 of IO tensor {'CrossPassTensor': ''}bfloat16 %input265|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28875 of IO tensor {'CrossPassTensor': ''}bfloat16 %input266|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28879 of IO tensor {'CrossPassTensor': ''}bfloat16 %input268(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28876 of IO tensor {'CrossPassTensor': ''}bfloat16 %input270(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28887 of IO tensor {'CrossPassTensor': ''}bfloat16 %input271(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(115, 'AG3052'), (334, 'AG3050'), (205, 'AG3051')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28886 of IO tensor {'CrossPassTensor': ''}bfloat16 %input272|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28884 of IO tensor {'CrossPassTensor': ''}bfloat16 %input273|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28885 of IO tensor {'CrossPassTensor': ''}bfloat16 %input274|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15849 of IO tensor {'CrossPassTensor': ''}bfloat16 %input275(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(207, 'AG3061'), (37, 'AG3057'), (116, 'AG3056'), (337, 'AG3060'), (428, 'AG3059')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28895 of IO tensor {'CrossPassTensor': ''}bfloat16 %input276|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28888 of IO tensor {'CrossPassTensor': ''}bfloat16 %input277|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28892 of IO tensor {'CrossPassTensor': ''}bfloat16 %input279(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28889 of IO tensor {'CrossPassTensor': ''}bfloat16 %input281(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28900 of IO tensor {'CrossPassTensor': ''}bfloat16 %input282(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(117, 'AG3064'), (338, 'AG3062'), (208, 'AG3063')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28899 of IO tensor {'CrossPassTensor': ''}bfloat16 %input283|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28897 of IO tensor {'CrossPassTensor': ''}bfloat16 %input284|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28898 of IO tensor {'CrossPassTensor': ''}bfloat16 %input285|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16100 of IO tensor {'CrossPassTensor': ''}bfloat16 %input286(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(210, 'AG3073'), (39, 'AG3069'), (118, 'AG3068'), (341, 'AG3072'), (429, 'AG3071')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28908 of IO tensor {'CrossPassTensor': ''}bfloat16 %input287|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28901 of IO tensor {'CrossPassTensor': ''}bfloat16 %input288|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28905 of IO tensor {'CrossPassTensor': ''}bfloat16 %input290(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28902 of IO tensor {'CrossPassTensor': ''}bfloat16 %input292(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28913 of IO tensor {'CrossPassTensor': ''}bfloat16 %input293(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(119, 'AG3076'), (342, 'AG3074'), (211, 'AG3075')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28912 of IO tensor {'CrossPassTensor': ''}bfloat16 %input294|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28910 of IO tensor {'CrossPassTensor': ''}bfloat16 %input295|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28911 of IO tensor {'CrossPassTensor': ''}bfloat16 %input296|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16351 of IO tensor {'CrossPassTensor': ''}bfloat16 %input297(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(213, 'AG3085'), (41, 'AG3081'), (120, 'AG3080'), (345, 'AG3084'), (430, 'AG3083')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28921 of IO tensor {'CrossPassTensor': ''}bfloat16 %input298|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28914 of IO tensor {'CrossPassTensor': ''}bfloat16 %input299|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28918 of IO tensor {'CrossPassTensor': ''}bfloat16 %input301(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28915 of IO tensor {'CrossPassTensor': ''}bfloat16 %input303(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28926 of IO tensor {'CrossPassTensor': ''}bfloat16 %input304(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(121, 'AG3088'), (346, 'AG3086'), (214, 'AG3087')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28925 of IO tensor {'CrossPassTensor': ''}bfloat16 %input305|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28923 of IO tensor {'CrossPassTensor': ''}bfloat16 %input306|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28924 of IO tensor {'CrossPassTensor': ''}bfloat16 %input307|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16602 of IO tensor {'CrossPassTensor': ''}bfloat16 %input308(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(216, 'AG3097'), (43, 'AG3093'), (122, 'AG3092'), (349, 'AG3096'), (431, 'AG3095')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28934 of IO tensor {'CrossPassTensor': ''}bfloat16 %input309|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28927 of IO tensor {'CrossPassTensor': ''}bfloat16 %input310|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28931 of IO tensor {'CrossPassTensor': ''}bfloat16 %input312(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28928 of IO tensor {'CrossPassTensor': ''}bfloat16 %input314(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28939 of IO tensor {'CrossPassTensor': ''}bfloat16 %input315(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(123, 'AG3100'), (350, 'AG3098'), (217, 'AG3099')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28938 of IO tensor {'CrossPassTensor': ''}bfloat16 %input316|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28936 of IO tensor {'CrossPassTensor': ''}bfloat16 %input317|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28937 of IO tensor {'CrossPassTensor': ''}bfloat16 %input318|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16853 of IO tensor {'CrossPassTensor': ''}bfloat16 %input319(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(219, 'AG3109'), (45, 'AG3105'), (124, 'AG3104'), (353, 'AG3108'), (432, 'AG3107')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28947 of IO tensor {'CrossPassTensor': ''}bfloat16 %input320|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28940 of IO tensor {'CrossPassTensor': ''}bfloat16 %input321|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28944 of IO tensor {'CrossPassTensor': ''}bfloat16 %input323(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28941 of IO tensor {'CrossPassTensor': ''}bfloat16 %input325(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28952 of IO tensor {'CrossPassTensor': ''}bfloat16 %input326(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(125, 'AG3112'), (354, 'AG3110'), (220, 'AG3111')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28951 of IO tensor {'CrossPassTensor': ''}bfloat16 %input327|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28949 of IO tensor {'CrossPassTensor': ''}bfloat16 %input328|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28950 of IO tensor {'CrossPassTensor': ''}bfloat16 %input329|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 17104 of IO tensor {'CrossPassTensor': ''}bfloat16 %input330(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(222, 'AG3121'), (47, 'AG3117'), (126, 'AG3116'), (357, 'AG3120'), (433, 'AG3119')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28960 of IO tensor {'CrossPassTensor': ''}bfloat16 %input331|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28953 of IO tensor {'CrossPassTensor': ''}bfloat16 %input332|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28957 of IO tensor {'CrossPassTensor': ''}bfloat16 %input334(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28954 of IO tensor {'CrossPassTensor': ''}bfloat16 %input336(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28965 of IO tensor {'CrossPassTensor': ''}bfloat16 %input337(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(127, 'AG3124'), (358, 'AG3122'), (223, 'AG3123')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28964 of IO tensor {'CrossPassTensor': ''}bfloat16 %input338|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28962 of IO tensor {'CrossPassTensor': ''}bfloat16 %input339|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28963 of IO tensor {'CrossPassTensor': ''}bfloat16 %input340|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 17355 of IO tensor {'CrossPassTensor': ''}bfloat16 %input341(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(225, 'AG3133'), (49, 'AG3129'), (128, 'AG3128'), (361, 'AG3132'), (434, 'AG3131')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28973 of IO tensor {'CrossPassTensor': ''}bfloat16 %input342|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28966 of IO tensor {'CrossPassTensor': ''}bfloat16 %input343|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28970 of IO tensor {'CrossPassTensor': ''}bfloat16 %input345(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28967 of IO tensor {'CrossPassTensor': ''}bfloat16 %input347(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28978 of IO tensor {'CrossPassTensor': ''}bfloat16 %input348(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(129, 'AG3136'), (362, 'AG3134'), (226, 'AG3135')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28977 of IO tensor {'CrossPassTensor': ''}bfloat16 %input349|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28975 of IO tensor {'CrossPassTensor': ''}bfloat16 %input350|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28976 of IO tensor {'CrossPassTensor': ''}bfloat16 %input351|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 17606 of IO tensor {'CrossPassTensor': ''}bfloat16 %input352(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(228, 'AG3145'), (51, 'AG3141'), (130, 'AG3140'), (365, 'AG3144'), (435, 'AG3143')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28986 of IO tensor {'CrossPassTensor': ''}bfloat16 %input353|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28979 of IO tensor {'CrossPassTensor': ''}bfloat16 %input354|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28983 of IO tensor {'CrossPassTensor': ''}bfloat16 %input356(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28980 of IO tensor {'CrossPassTensor': ''}bfloat16 %input358(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28991 of IO tensor {'CrossPassTensor': ''}bfloat16 %input359(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(131, 'AG3148'), (366, 'AG3146'), (229, 'AG3147')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28990 of IO tensor {'CrossPassTensor': ''}bfloat16 %input360|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28988 of IO tensor {'CrossPassTensor': ''}bfloat16 %input361|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28989 of IO tensor {'CrossPassTensor': ''}bfloat16 %input362|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 17857 of IO tensor {'CrossPassTensor': ''}bfloat16 %input363(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(231, 'AG3157'), (53, 'AG3153'), (132, 'AG3152'), (369, 'AG3156'), (436, 'AG3155')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28999 of IO tensor {'CrossPassTensor': ''}bfloat16 %input364|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28992 of IO tensor {'CrossPassTensor': ''}bfloat16 %input365|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28996 of IO tensor {'CrossPassTensor': ''}bfloat16 %input367(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28993 of IO tensor {'CrossPassTensor': ''}bfloat16 %input369(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29004 of IO tensor {'CrossPassTensor': ''}bfloat16 %input370(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(133, 'AG3160'), (370, 'AG3158'), (232, 'AG3159')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29003 of IO tensor {'CrossPassTensor': ''}bfloat16 %input371|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29001 of IO tensor {'CrossPassTensor': ''}bfloat16 %input372|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29002 of IO tensor {'CrossPassTensor': ''}bfloat16 %input373|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 18108 of IO tensor {'CrossPassTensor': ''}bfloat16 %input374(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(234, 'AG3169'), (55, 'AG3165'), (134, 'AG3164'), (373, 'AG3168'), (437, 'AG3167')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29012 of IO tensor {'CrossPassTensor': ''}bfloat16 %input375|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29005 of IO tensor {'CrossPassTensor': ''}bfloat16 %input376|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29009 of IO tensor {'CrossPassTensor': ''}bfloat16 %input378(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29006 of IO tensor {'CrossPassTensor': ''}bfloat16 %input380(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29017 of IO tensor {'CrossPassTensor': ''}bfloat16 %input381(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(135, 'AG3172'), (374, 'AG3170'), (235, 'AG3171')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29016 of IO tensor {'CrossPassTensor': ''}bfloat16 %input382|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29014 of IO tensor {'CrossPassTensor': ''}bfloat16 %input383|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29015 of IO tensor {'CrossPassTensor': ''}bfloat16 %input384|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 18359 of IO tensor {'CrossPassTensor': ''}bfloat16 %input385(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(237, 'AG3181'), (57, 'AG3177'), (136, 'AG3176'), (377, 'AG3180'), (438, 'AG3179')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29025 of IO tensor {'CrossPassTensor': ''}bfloat16 %input386|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29018 of IO tensor {'CrossPassTensor': ''}bfloat16 %input387|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29022 of IO tensor {'CrossPassTensor': ''}bfloat16 %input389(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29019 of IO tensor {'CrossPassTensor': ''}bfloat16 %input391(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29030 of IO tensor {'CrossPassTensor': ''}bfloat16 %input392(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(137, 'AG3184'), (378, 'AG3182'), (238, 'AG3183')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29029 of IO tensor {'CrossPassTensor': ''}bfloat16 %input393|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29027 of IO tensor {'CrossPassTensor': ''}bfloat16 %input394|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29028 of IO tensor {'CrossPassTensor': ''}bfloat16 %input395|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 18610 of IO tensor {'CrossPassTensor': ''}bfloat16 %input396(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(240, 'AG3193'), (59, 'AG3189'), (138, 'AG3188'), (381, 'AG3192'), (439, 'AG3191')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29038 of IO tensor {'CrossPassTensor': ''}bfloat16 %input397|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29031 of IO tensor {'CrossPassTensor': ''}bfloat16 %input398|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29035 of IO tensor {'CrossPassTensor': ''}bfloat16 %input400(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29032 of IO tensor {'CrossPassTensor': ''}bfloat16 %input402(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29043 of IO tensor {'CrossPassTensor': ''}bfloat16 %input403(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(139, 'AG3196'), (382, 'AG3194'), (241, 'AG3195')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29042 of IO tensor {'CrossPassTensor': ''}bfloat16 %input404|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29040 of IO tensor {'CrossPassTensor': ''}bfloat16 %input405|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29041 of IO tensor {'CrossPassTensor': ''}bfloat16 %input406|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 18861 of IO tensor {'CrossPassTensor': ''}bfloat16 %input407(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(243, 'AG3205'), (61, 'AG3201'), (140, 'AG3200'), (385, 'AG3204'), (440, 'AG3203')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29051 of IO tensor {'CrossPassTensor': ''}bfloat16 %input408|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29044 of IO tensor {'CrossPassTensor': ''}bfloat16 %input409|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29048 of IO tensor {'CrossPassTensor': ''}bfloat16 %input411(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29045 of IO tensor {'CrossPassTensor': ''}bfloat16 %input413(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29056 of IO tensor {'CrossPassTensor': ''}bfloat16 %input414(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(141, 'AG3208'), (386, 'AG3206'), (244, 'AG3207')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29055 of IO tensor {'CrossPassTensor': ''}bfloat16 %input415|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29053 of IO tensor {'CrossPassTensor': ''}bfloat16 %input416|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29054 of IO tensor {'CrossPassTensor': ''}bfloat16 %input417|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 19112 of IO tensor {'CrossPassTensor': ''}bfloat16 %input418(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(246, 'AG3217'), (63, 'AG3213'), (142, 'AG3212'), (389, 'AG3216'), (441, 'AG3215')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29064 of IO tensor {'CrossPassTensor': ''}bfloat16 %input419|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29057 of IO tensor {'CrossPassTensor': ''}bfloat16 %input420|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29061 of IO tensor {'CrossPassTensor': ''}bfloat16 %input422(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29058 of IO tensor {'CrossPassTensor': ''}bfloat16 %input424(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29069 of IO tensor {'CrossPassTensor': ''}bfloat16 %input425(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(143, 'AG3220'), (390, 'AG3218'), (247, 'AG3219')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29068 of IO tensor {'CrossPassTensor': ''}bfloat16 %input426|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29066 of IO tensor {'CrossPassTensor': ''}bfloat16 %input427|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29067 of IO tensor {'CrossPassTensor': ''}bfloat16 %input428|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 19363 of IO tensor {'CrossPassTensor': ''}bfloat16 %input429(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(249, 'AG3229'), (65, 'AG3225'), (144, 'AG3224'), (393, 'AG3228'), (442, 'AG3227')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29077 of IO tensor {'CrossPassTensor': ''}bfloat16 %input430|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29070 of IO tensor {'CrossPassTensor': ''}bfloat16 %input431|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29074 of IO tensor {'CrossPassTensor': ''}bfloat16 %input433(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29071 of IO tensor {'CrossPassTensor': ''}bfloat16 %input435(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29082 of IO tensor {'CrossPassTensor': ''}bfloat16 %input436(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(145, 'AG3232'), (394, 'AG3230'), (250, 'AG3231')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29081 of IO tensor {'CrossPassTensor': ''}bfloat16 %input437|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29079 of IO tensor {'CrossPassTensor': ''}bfloat16 %input438|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29080 of IO tensor {'CrossPassTensor': ''}bfloat16 %input439|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 19614 of IO tensor {'CrossPassTensor': ''}bfloat16 %input440(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(252, 'AG3241'), (67, 'AG3237'), (146, 'AG3236'), (397, 'AG3240'), (443, 'AG3239')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29090 of IO tensor {'CrossPassTensor': ''}bfloat16 %input441|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29083 of IO tensor {'CrossPassTensor': ''}bfloat16 %input442|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29087 of IO tensor {'CrossPassTensor': ''}bfloat16 %input444(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29084 of IO tensor {'CrossPassTensor': ''}bfloat16 %input446(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29095 of IO tensor {'CrossPassTensor': ''}bfloat16 %input447(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(147, 'AG3244'), (398, 'AG3242'), (253, 'AG3243')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29094 of IO tensor {'CrossPassTensor': ''}bfloat16 %input448|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29092 of IO tensor {'CrossPassTensor': ''}bfloat16 %input449|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29093 of IO tensor {'CrossPassTensor': ''}bfloat16 %input450|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 19865 of IO tensor {'CrossPassTensor': ''}bfloat16 %input451(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(255, 'AG3253'), (69, 'AG3249'), (148, 'AG3248'), (401, 'AG3252'), (444, 'AG3251')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29103 of IO tensor {'CrossPassTensor': ''}bfloat16 %input452|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29096 of IO tensor {'CrossPassTensor': ''}bfloat16 %input453|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29100 of IO tensor {'CrossPassTensor': ''}bfloat16 %input455(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29097 of IO tensor {'CrossPassTensor': ''}bfloat16 %input457(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29108 of IO tensor {'CrossPassTensor': ''}bfloat16 %input458(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(149, 'AG3256'), (402, 'AG3254'), (256, 'AG3255')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29107 of IO tensor {'CrossPassTensor': ''}bfloat16 %input459|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29105 of IO tensor {'CrossPassTensor': ''}bfloat16 %input460|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29106 of IO tensor {'CrossPassTensor': ''}bfloat16 %input461|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 20116 of IO tensor {'CrossPassTensor': ''}bfloat16 %input462(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(258, 'AG3265'), (71, 'AG3261'), (150, 'AG3260'), (405, 'AG3264'), (445, 'AG3263')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29116 of IO tensor {'CrossPassTensor': ''}bfloat16 %input463|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29109 of IO tensor {'CrossPassTensor': ''}bfloat16 %input464|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29113 of IO tensor {'CrossPassTensor': ''}bfloat16 %input466(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29110 of IO tensor {'CrossPassTensor': ''}bfloat16 %input468(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29121 of IO tensor {'CrossPassTensor': ''}bfloat16 %input469(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(151, 'AG3268'), (406, 'AG3266'), (259, 'AG3267')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29120 of IO tensor {'CrossPassTensor': ''}bfloat16 %input470|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29118 of IO tensor {'CrossPassTensor': ''}bfloat16 %input471|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29119 of IO tensor {'CrossPassTensor': ''}bfloat16 %input472|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29122 of IO tensor {'CrossPassTensor': ''}bfloat16 %input474|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2844'), (265, 'AG2845')] +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 2.233 seconds +2025-08-07T13:54:55Z INFO 48500 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-08-07T13:54:56Z INFO 48500 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-08-07T13:54:56Z INFO 48500 [sg0000/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.252 seconds +2025-08-07T13:54:56Z INFO 48500 [sg0000/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-08-07T13:54:56Z INFO 48500 [sg0000/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-08-07T13:54:56Z INFO 48500 [sg0000/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.290 seconds +2025-08-07T13:54:56Z INFO 48500 [sg0000/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-08-07T13:54:56Z INFO 48500 [sg0000/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-08-07T13:54:56Z INFO 48500 [sg0000/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.058 seconds +2025-08-07T13:54:56Z INFO 48500 [sg0000/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-08-07T13:54:56Z INFO 48500 [sg0000/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-08-07T13:54:56Z INFO 48500 [sg0000/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.014 seconds +2025-08-07T13:54:56Z INFO 48500 [sg0000/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-08-07T13:54:58Z INFO 48500 [sg0000/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-08-07T13:54:58Z INFO 48500 [sg0000/Tensorizer/MacroGeneration]: MacroGeneration finished after 2.453 seconds +2025-08-07T13:54:58Z INFO 48500 [sg0000/Tensorizer/PGTiling]: PGTiling finished after 5.325 seconds +2025-08-07T13:54:58Z INFO 48500 [sg0000/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-08-07T13:54:59Z INFO 48500 [sg0000/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-08-07T13:54:59Z INFO 48500 [sg0000/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 1.035 seconds +2025-08-07T13:54:59Z INFO 48500 [sg0000/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-08-07T13:54:59Z INFO 48500 [sg0000/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-08-07T13:54:59Z INFO 48500 [sg0000/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.099 seconds +2025-08-07T13:54:59Z INFO 48500 [sg0000/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-08-07T13:55:01Z INFO 48500 [sg0000/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-08-07T13:55:01Z INFO 48500 [sg0000/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 1.145 seconds +2025-08-07T13:55:01Z INFO 48500 [sg0000/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 44.858 seconds +2025-08-07T13:55:01Z INFO 48500 [sg0000/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-08-07T13:55:01Z INFO 48500 [sg0000/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:55:01Z INFO 48500 [sg0000/Tensorizer/TilingBottleneck]: 19008: transpose_128x128 +2025-08-07T13:55:01Z INFO 48500 [sg0000/Tensorizer/TilingBottleneck]: 19008: matmul_128x128x1 +2025-08-07T13:55:01Z INFO 48500 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:01Z INFO 48500 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:01Z INFO 48500 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:01Z INFO 48500 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:01Z INFO 48500 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:01Z INFO 48500 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:01Z INFO 48500 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:01Z INFO 48500 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:01Z INFO 48500 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:01Z INFO 48500 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:01Z INFO 48500 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:01Z INFO 48500 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:01Z INFO 48500 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:01Z INFO 48500 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:01Z INFO 48500 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:01Z INFO 48500 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:01Z INFO 48500 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:01Z INFO 48500 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:01Z INFO 48500 [sg0000/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-08-07T13:55:01Z INFO 48500 [sg0000/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.418 seconds +2025-08-07T13:55:01Z INFO 48500 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:55:02Z INFO 48500 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:55:02Z INFO 48500 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.500 seconds +2025-08-07T13:55:02Z INFO 48500 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-08-07T13:55:03Z INFO 48500 [sg0000/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-08-07T13:55:03Z INFO 48500 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 1.902 seconds +2025-08-07T13:55:03Z INFO 48500 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:55:04Z INFO 48500 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:55:04Z INFO 48500 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.267 seconds +2025-08-07T13:55:04Z INFO 48500 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:55:04Z INFO 48500 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:55:04Z INFO 48500 [sg0000/Tensorizer/LICM]: LICM finished after 0.102 seconds +2025-08-07T13:55:04Z INFO 48500 [sg0000/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-08-07T13:55:04Z INFO 48500 [sg0000/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-08-07T13:55:04Z INFO 48500 [sg0000/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.050 seconds +2025-08-07T13:55:04Z INFO 48500 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:55:04Z INFO 48500 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:55:04Z INFO 48500 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.163 seconds +2025-08-07T13:55:04Z INFO 48500 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:55:04Z INFO 48500 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-08-07T13:55:04Z INFO 48500 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.139 seconds +2025-08-07T13:55:04Z INFO 48500 [sg0000/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-08-07T13:55:06Z INFO 48500 [sg0000/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-08-07T13:55:06Z INFO 48500 [sg0000/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 2.042 seconds +2025-08-07T13:55:06Z INFO 48500 [sg0000/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-08-07T13:55:06Z INFO 48500 [sg0000/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:55:06Z INFO 48500 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 19008: transpose_128x128 +2025-08-07T13:55:06Z INFO 48500 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 19008: matmul_128x128x1 +2025-08-07T13:55:06Z INFO 48500 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:06Z INFO 48500 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:06Z INFO 48500 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:06Z INFO 48500 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:06Z INFO 48500 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:06Z INFO 48500 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:06Z INFO 48500 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:06Z INFO 48500 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:06Z INFO 48500 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:06Z INFO 48500 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:06Z INFO 48500 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:06Z INFO 48500 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:06Z INFO 48500 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:06Z INFO 48500 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:06Z INFO 48500 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:06Z INFO 48500 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:06Z INFO 48500 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:06Z INFO 48500 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:06Z INFO 48500 [sg0000/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-08-07T13:55:06Z INFO 48500 [sg0000/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.078 seconds +2025-08-07T13:55:06Z INFO 48500 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.322 seconds +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12250 | hlo_id: 12250 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12235 | hlo_id: 12235 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12278 | hlo_id: 12278 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12263 | hlo_id: 12263 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12306 | hlo_id: 12306 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12291 | hlo_id: 12291 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12334 | hlo_id: 12334 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12319 | hlo_id: 12319 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12362 | hlo_id: 12362 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12347 | hlo_id: 12347 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12390 | hlo_id: 12390 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12375 | hlo_id: 12375 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12418 | hlo_id: 12418 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12403 | hlo_id: 12403 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12446 | hlo_id: 12446 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12431 | hlo_id: 12431 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12474 | hlo_id: 12474 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12459 | hlo_id: 12459 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12502 | hlo_id: 12502 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12487 | hlo_id: 12487 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12530 | hlo_id: 12530 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12515 | hlo_id: 12515 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12558 | hlo_id: 12558 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12543 | hlo_id: 12543 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12586 | hlo_id: 12586 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12571 | hlo_id: 12571 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12614 | hlo_id: 12614 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12599 | hlo_id: 12599 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12642 | hlo_id: 12642 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12627 | hlo_id: 12627 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12670 | hlo_id: 12670 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12655 | hlo_id: 12655 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12698 | hlo_id: 12698 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12683 | hlo_id: 12683 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12726 | hlo_id: 12726 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12711 | hlo_id: 12711 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12754 | hlo_id: 12754 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12739 | hlo_id: 12739 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12782 | hlo_id: 12782 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12767 | hlo_id: 12767 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12810 | hlo_id: 12810 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12795 | hlo_id: 12795 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12838 | hlo_id: 12838 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12823 | hlo_id: 12823 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12866 | hlo_id: 12866 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12851 | hlo_id: 12851 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12894 | hlo_id: 12894 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12879 | hlo_id: 12879 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12922 | hlo_id: 12922 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12907 | hlo_id: 12907 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12950 | hlo_id: 12950 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12935 | hlo_id: 12935 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12978 | hlo_id: 12978 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12963 | hlo_id: 12963 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13006 | hlo_id: 13006 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12991 | hlo_id: 12991 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13034 | hlo_id: 13034 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13019 | hlo_id: 13019 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13062 | hlo_id: 13062 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13047 | hlo_id: 13047 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13090 | hlo_id: 13090 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13075 | hlo_id: 13075 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13118 | hlo_id: 13118 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13103 | hlo_id: 13103 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13146 | hlo_id: 13146 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13131 | hlo_id: 13131 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13174 | hlo_id: 13174 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13159 | hlo_id: 13159 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13202 | hlo_id: 13202 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13187 | hlo_id: 13187 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13230 | hlo_id: 13230 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13215 | hlo_id: 13215 | +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.419 seconds +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.342 seconds +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:55:07Z INFO 48500 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:55:08Z INFO 48500 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.066 seconds +2025-08-07T13:55:08Z INFO 48500 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:55:08Z INFO 48500 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:55:08Z INFO 48500 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.191 seconds +2025-08-07T13:55:08Z INFO 48500 [sg0000/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-08-07T13:55:08Z INFO 48500 [sg0000/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-08-07T13:55:08Z INFO 48500 [sg0000/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.067 seconds +2025-08-07T13:55:08Z INFO 48500 [sg0000/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-08-07T13:55:08Z INFO 48500 [sg0000/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-08-07T13:55:08Z INFO 48500 [sg0000/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.023 seconds +2025-08-07T13:55:08Z INFO 48500 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:55:08Z INFO 48500 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-08-07T13:55:08Z INFO 48500 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.081 seconds +2025-08-07T13:55:08Z INFO 48500 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:55:08Z INFO 48500 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-08-07T13:55:08Z INFO 48500 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.200 seconds +2025-08-07T13:55:08Z INFO 48500 [sg0000/Tensorizer/InferInitValue]: Running InferInitValue +2025-08-07T13:55:09Z INFO 48500 [sg0000/Tensorizer/InferInitValue]: Finished (changed=True) +2025-08-07T13:55:09Z INFO 48500 [sg0000/Tensorizer/InferInitValue]: InferInitValue finished after 1.128 seconds +2025-08-07T13:55:09Z INFO 48500 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:55:10Z INFO 48500 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:55:10Z INFO 48500 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.337 seconds +2025-08-07T13:55:10Z INFO 48500 [sg0000/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-08-07T13:55:10Z INFO 48500 [sg0000/Tensorizer/SimplifyTensor]: Finished (changed=True) +2025-08-07T13:55:10Z INFO 48500 [sg0000/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.232 seconds +2025-08-07T13:55:10Z INFO 48500 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:55:10Z INFO 48500 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:55:10Z INFO 48500 [sg0000/Tensorizer/LICM]: LICM finished after 0.116 seconds +2025-08-07T13:55:10Z INFO 48500 [sg0000/Tensorizer/SundaISel]: Running SundaISel +2025-08-07T13:55:12Z INFO 48500 [sg0000/Tensorizer/SundaISel]: Finished (changed=True) +2025-08-07T13:55:12Z INFO 48500 [sg0000/Tensorizer/SundaISel]: SundaISel finished after 1.921 seconds +2025-08-07T13:55:12Z INFO 48500 [sg0000/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-08-07T13:55:12Z INFO 48500 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:55:12Z INFO 48500 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-08-07T13:55:12Z INFO 48500 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.003 seconds +2025-08-07T13:55:12Z INFO 48500 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-08-07T13:55:12Z INFO 48500 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=True) +2025-08-07T13:55:12Z INFO 48500 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.028 seconds +2025-08-07T13:55:12Z INFO 48500 [sg0000/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.037 seconds +2025-08-07T13:55:12Z INFO 48500 [sg0000/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-08-07T13:55:12Z INFO 48500 [sg0000/Tensorizer/LowerComplexBroadcast]: Finished (changed=True) +2025-08-07T13:55:12Z INFO 48500 [sg0000/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.163 seconds +2025-08-07T13:55:12Z INFO 48500 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:55:12Z INFO 48500 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-08-07T13:55:12Z INFO 48500 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.052 seconds +2025-08-07T13:55:12Z INFO 48500 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:55:12Z INFO 48500 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:55:12Z INFO 48500 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.059 seconds +2025-08-07T13:55:12Z INFO 48500 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-08-07T13:55:13Z INFO 48500 [sg0000/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-08-07T13:55:13Z INFO 48500 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.474 seconds +2025-08-07T13:55:13Z INFO 48500 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:55:13Z INFO 48500 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-08-07T13:55:13Z INFO 48500 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.051 seconds +2025-08-07T13:55:13Z INFO 48500 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:55:13Z INFO 48500 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:55:13Z INFO 48500 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.248 seconds +2025-08-07T13:55:13Z INFO 48500 [sg0000/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:55:13Z INFO 48500 [sg0000/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-08-07T13:55:13Z INFO 48500 [sg0000/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.283 seconds +2025-08-07T13:55:13Z INFO 48500 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:55:15Z INFO 48500 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-08-07T13:55:15Z INFO 48500 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 1.477 seconds +2025-08-07T13:55:15Z INFO 48500 [sg0000/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:55:15Z INFO 48500 [sg0000/Tensorizer/NeuronValueNumbering]: Finished (changed=True) +2025-08-07T13:55:15Z INFO 48500 [sg0000/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.121 seconds +2025-08-07T13:55:15Z INFO 48500 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:55:15Z INFO 48500 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:55:15Z INFO 48500 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.207 seconds +2025-08-07T13:55:15Z INFO 48500 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-08-07T13:55:15Z INFO 48500 [sg0000/Tensorizer/VectorizeDMA]: Finished (changed=False) +2025-08-07T13:55:15Z INFO 48500 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.038 seconds +2025-08-07T13:55:15Z INFO 48500 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:55:15Z INFO 48500 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:55:15Z INFO 48500 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.047 seconds +2025-08-07T13:55:15Z INFO 48500 [sg0000/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-08-07T13:55:15Z INFO 48500 [sg0000/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-08-07T13:55:15Z INFO 48500 [sg0000/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.039 seconds +2025-08-07T13:55:15Z INFO 48500 [sg0000/Tensorizer/DeConcat]: Running DeConcat +2025-08-07T13:55:15Z INFO 48500 [sg0000/Tensorizer/DeConcat]: Finished (changed=False) +2025-08-07T13:55:15Z INFO 48500 [sg0000/Tensorizer/DeConcat]: DeConcat finished after 0.014 seconds +2025-08-07T13:55:15Z INFO 48500 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-08-07T13:55:15Z INFO 48500 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-08-07T13:55:15Z INFO 48500 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.042 seconds +2025-08-07T13:55:15Z INFO 48500 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-08-07T13:55:16Z INFO 48500 [sg0000/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-08-07T13:55:16Z INFO 48500 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.275 seconds +2025-08-07T13:55:16Z INFO 48500 [sg0000/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-08-07T13:55:17Z INFO 48500 [sg0000/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-08-07T13:55:17Z INFO 48500 [sg0000/Tensorizer/TritiumFusion]: TritiumFusion finished after 1.224 seconds +2025-08-07T13:55:17Z INFO 48500 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:55:17Z INFO 48500 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-08-07T13:55:17Z INFO 48500 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.312 seconds +2025-08-07T13:55:17Z INFO 48500 [sg0000/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-08-07T13:55:17Z INFO 48500 [sg0000/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-08-07T13:55:17Z INFO 48500 [sg0000/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.024 seconds +2025-08-07T13:55:17Z INFO 48500 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-08-07T13:55:17Z INFO 48500 [sg0000/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-08-07T13:55:17Z INFO 48500 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.318 seconds +2025-08-07T13:55:17Z INFO 48500 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:55:18Z INFO 48500 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:55:18Z INFO 48500 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.216 seconds +2025-08-07T13:55:18Z INFO 48500 [sg0000/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:55:18Z INFO 48500 [sg0000/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-08-07T13:55:18Z INFO 48500 [sg0000/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.426 seconds +2025-08-07T13:55:18Z INFO 48500 [sg0000/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:55:18Z INFO 48500 [sg0000/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:55:18Z INFO 48500 [sg0000/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.053 seconds +2025-08-07T13:55:18Z INFO 48500 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:55:19Z INFO 48500 [sg0000/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-08-07T13:55:19Z INFO 48500 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.483 seconds +2025-08-07T13:55:19Z INFO 48500 [sg0000/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-08-07T13:55:19Z INFO 48500 [sg0000/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-08-07T13:55:19Z INFO 48500 [sg0000/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.043 seconds +2025-08-07T13:55:19Z INFO 48500 [sg0000/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:55:19Z INFO 48500 [sg0000/Tensorizer/SpillPSum]: Finished (changed=True) +2025-08-07T13:55:19Z INFO 48500 [sg0000/Tensorizer/SpillPSum]: SpillPSum finished after 0.377 seconds +2025-08-07T13:55:19Z INFO 48500 [sg0000/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:55:21Z INFO 48500 [sg0000/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-08-07T13:55:21Z INFO 48500 [sg0000/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 1.474 seconds +2025-08-07T13:55:21Z INFO 48500 [sg0000/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-08-07T13:55:21Z INFO 48500 [sg0000/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-08-07T13:55:21Z INFO 48500 [sg0000/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.058 seconds +2025-08-07T13:55:21Z INFO 48500 [sg0000/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:55:21Z INFO 48500 [sg0000/Tensorizer/LegalizeType]: Finished (changed=True) +2025-08-07T13:55:21Z INFO 48500 [sg0000/Tensorizer/LegalizeType]: LegalizeType finished after 0.198 seconds +2025-08-07T13:55:21Z INFO 48500 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:55:21Z INFO 48500 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:55:21Z INFO 48500 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.296 seconds +2025-08-07T13:55:21Z INFO 48500 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:55:22Z INFO 48500 [sg0000/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-08-07T13:55:22Z INFO 48500 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 1.073 seconds +2025-08-07T13:55:22Z INFO 48500 [sg0000/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:55:22Z INFO 48500 [sg0000/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-08-07T13:55:22Z INFO 48500 [sg0000/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.059 seconds +2025-08-07T13:55:22Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:55:24Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-08-07T13:55:24Z INFO 48500 [sg0000/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 1.465 seconds +2025-08-07T13:55:24Z INFO 48500 [sg0000/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-08-07T13:55:24Z INFO 48500 [sg0000/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-08-07T13:55:24Z INFO 48500 [sg0000/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.167 seconds +2025-08-07T13:55:24Z INFO 48500 [sg0000/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-08-07T13:55:24Z INFO 48500 [sg0000/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-08-07T13:55:24Z INFO 48500 [sg0000/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.144 seconds +2025-08-07T13:55:24Z INFO 48500 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:55:24Z INFO 48500 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:55:24Z INFO 48500 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.183 seconds +2025-08-07T13:55:24Z INFO 48500 [sg0000/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:55:24Z INFO 48500 [sg0000/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:55:24Z INFO 48500 [sg0000/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.096 seconds +2025-08-07T13:55:24Z INFO 48500 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:55:26Z INFO 48500 [sg0000/Tensorizer/SimplifyNeuronTensor]: Finished (changed=True) +2025-08-07T13:55:26Z INFO 48500 [sg0000/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 1.408 seconds +2025-08-07T13:55:26Z INFO 48500 [sg0000/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:55:26Z INFO 48500 [sg0000/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-08-07T13:55:26Z INFO 48500 [sg0000/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.036 seconds +2025-08-07T13:55:26Z INFO 48500 [sg0000/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:55:26Z INFO 48500 [sg0000/Tensorizer/DataStreaming]: Finished (changed=False) +2025-08-07T13:55:26Z INFO 48500 [sg0000/Tensorizer/DataStreaming]: DataStreaming finished after 0.164 seconds +2025-08-07T13:55:26Z INFO 48500 [sg0000/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:55:30Z INFO 48500 [sg0000/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:55:30Z INFO 48500 [sg0000/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 3.927 seconds +2025-08-07T13:55:30Z INFO 48500 [sg0000/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:55:30Z INFO 48500 [sg0000/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-08-07T13:55:30Z INFO 48500 [sg0000/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.237 seconds +2025-08-07T13:55:30Z INFO 48500 [sg0000/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:55:30Z INFO 48500 [sg0000/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-08-07T13:55:30Z INFO 48500 [sg0000/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.199 seconds +2025-08-07T13:55:30Z INFO 48500 [sg0000/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:55:30Z INFO 48500 [sg0000/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48500 [sg0000/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.070 seconds +2025-08-07T13:55:30Z INFO 48500 [sg0000/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:55:30Z INFO 48500 [sg0000/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:55:30Z INFO 48500 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 2.705ms (594.000MiB, est bw: 230.258GB/s, 7.691% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (594, 128, 4096) %'36874.52110'[i4422_0,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (75968, 4096) %'input473'[128i4422_0+i0.128,i1.4096] # id=52109, src_id=None, , instances=594 # dl = tensor_op_name: input473_pftranspose_36874 | hlo_id: 20004 | if -128i4422_0-i0.128+75967 >= 0 [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-08-07T13:55:30Z INFO 48500 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.658% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input84_local_39100'[i148_0,i147_0_0_39104,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 2, 128, 3072) %'input84'[i148_0,i147_0_0_39104,i0.128,i1.3072] # id=43192, src_id=None, , instances=64 # dl = tensor_op_name: _dot.395 | hlo_id: 15976 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:55:30Z INFO 48500 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.658% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input95_local_39176'[i270_0,i269_0_0_39180,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 2, 128, 3072) %'input95'[i270_0,i269_0_0_39180,i0.128,i1.3072] # id=43366, src_id=None, , instances=64 # dl = tensor_op_name: _dot.727 | hlo_id: 16091 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:55:30Z INFO 48500 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.658% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input106_local_39252'[i392_0,i391_0_0_39256,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 2, 128, 3072) %'input106'[i392_0,i391_0_0_39256,i0.128,i1.3072] # id=43540, src_id=None, , instances=64 # dl = tensor_op_name: _dot.1059 | hlo_id: 16206 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:55:30Z INFO 48500 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.658% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input117_local_39328'[i514_0,i513_0_0_39332,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 2, 128, 3072) %'input117'[i514_0,i513_0_0_39332,i0.128,i1.3072] # id=43714, src_id=None, , instances=64 # dl = tensor_op_name: _dot.1391 | hlo_id: 16321 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:55:30Z INFO 48500 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.658% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input128_local_39404'[i636_0,i635_0_0_39408,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 2, 128, 3072) %'input128'[i636_0,i635_0_0_39408,i0.128,i1.3072] # id=43888, src_id=None, , instances=64 # dl = tensor_op_name: _dot.1723 | hlo_id: 16436 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:55:30Z INFO 48500 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.658% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input139_local_39480'[i758_0,i757_0_0_39484,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 2, 128, 3072) %'input139'[i758_0,i757_0_0_39484,i0.128,i1.3072] # id=44062, src_id=None, , instances=64 # dl = tensor_op_name: _dot.2055 | hlo_id: 16551 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:55:30Z INFO 48500 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.658% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input150_local_39556'[i880_0,i879_0_0_39560,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 2, 128, 3072) %'input150'[i880_0,i879_0_0_39560,i0.128,i1.3072] # id=44236, src_id=None, , instances=64 # dl = tensor_op_name: _dot.2387 | hlo_id: 16666 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:55:30Z INFO 48500 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.658% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input161_local_39632'[i1002_0,i1001_0_0_39636,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 2, 128, 3072) %'input161'[i1002_0,i1001_0_0_39636,i0.128,i1.3072] # id=44410, src_id=None, , instances=64 # dl = tensor_op_name: _dot.2719 | hlo_id: 16781 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:55:30Z INFO 48500 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.658% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input172_local_39708'[i1124_0,i1123_0_0_39712,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 2, 128, 3072) %'input172'[i1124_0,i1123_0_0_39712,i0.128,i1.3072] # id=44584, src_id=None, , instances=64 # dl = tensor_op_name: _dot.3051 | hlo_id: 16896 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:55:31Z INFO 48500 [sg0000/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [sg0000/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.093 seconds +2025-08-07T13:55:31Z INFO 48500 [sg0000/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.001 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.001 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.003 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.006 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (1, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (1, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.001 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.003 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (1, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (1, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48500 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-08-07T13:55:31Z INFO 48500 [sg0000/Tensorizer/OptimizeNKIKernels]: Finished (changed=True) +2025-08-07T13:55:31Z INFO 48500 [sg0000/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.445 seconds +2025-08-07T13:55:31Z INFO 48500 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:55:31Z INFO 48500 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-08-07T13:55:31Z INFO 48500 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.499 seconds +2025-08-07T13:55:31Z INFO 48500 [sg0000/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-08-07T13:55:32Z WARNING 48500 [sg0000/Tensorizer/StaticProfiler]: matmul-based transposes inserted by penguin takes up 91.36 percent of all matmul computation +2025-08-07T13:55:32Z INFO 48500 [sg0000/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-08-07T13:55:32Z INFO 48500 [sg0000/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.144 seconds +2025-08-07T13:55:32Z INFO 48500 [sg0000/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-08-07T13:55:32Z INFO 48500 [sg0000/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-08-07T13:55:32Z INFO 48500 [sg0000/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.348 seconds +2025-08-07T13:55:32Z INFO 48500 [sg0000/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-08-07T13:55:32Z INFO 48500 [sg0000/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-08-07T13:55:32Z INFO 48500 [sg0000/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.101 seconds +2025-08-07T13:55:32Z INFO 48500 [sg0000/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-08-07T13:55:32Z INFO 48500 [sg0000/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-08-07T13:55:32Z INFO 48500 [sg0000/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.251 seconds +2025-08-07T13:55:32Z INFO 48500 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-08-07T13:55:32Z INFO 48500 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-08-07T13:55:32Z INFO 48500 [sg0000/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-08-07T13:55:32Z INFO 48500 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-08-07T13:55:35Z INFO 48500 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-08-07T13:55:35Z INFO 48500 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 2.594 seconds +2025-08-07T13:55:37Z INFO 48500 [Tensorizer]: BirCodeGen estimate #instances=323713 in sg0000 +2025-08-07T13:55:37Z INFO 48500 [Tensorizer]: IR signature: 12d2c4dc53942d1c07cacda64cf9efadcc8685d627a34f07dbffeb7992985342 for nc00/sg0000/TensorizerBIR +2025-08-07T13:55:37Z INFO 48500 [Tensorizer]: Weights total number of bytes: 4952584 +2025-08-07T13:55:37Z INFO 48500 [Tensorizer]: Successfully built model. +2025-08-07T13:55:37Z USER 48500 [root/Tensorizer/Tensorizer]: Tensorizer finished after 104.881 seconds +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: End tensorization +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input0 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input1 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input2 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input3 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input4 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input5 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input6 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input7 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input8 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input9 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input10 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input11 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input12 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input13 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input14 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input15 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input16 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input17 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input18 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input19 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input20 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input21 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input22 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input23 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input24 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input25 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input26 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input27 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input28 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input29 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input30 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input31 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input32 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input33 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input34 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input35 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input36 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input37 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input38 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input39 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input40 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input41 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input42 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input43 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input44 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input45 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input46 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input47 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input48 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input49 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input50 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input51 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input52 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input53 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input54 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input55 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input56 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input57 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input58 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input59 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input60 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input61 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input62 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input63 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input64 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input65 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input66 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input67 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input68 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input69 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input70 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input71 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input72 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input73 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input74 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input75 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input76 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input77 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input78 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input79 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input80 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input81 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input82 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input83 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input84 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input85 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input86 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input87 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input88 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input89 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input90 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input91 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input92 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input93 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input94 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input95 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input96 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input97 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input98 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input99 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input100 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input101 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input102 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input103 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input104 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input105 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input106 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input107 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input108 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input109 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input110 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input111 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input112 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input113 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input114 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input115 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input116 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input117 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input118 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input119 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input120 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input121 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input122 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input123 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input124 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input125 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input126 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input127 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input128 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input129 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input130 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input131 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input132 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input133 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input134 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input135 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input136 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input137 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input138 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input139 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input140 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input141 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input142 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input143 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input144 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input145 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input146 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input147 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input148 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input149 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input150 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input151 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input152 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input153 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input154 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input155 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input156 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input157 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input158 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input159 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input160 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input161 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input162 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input163 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input164 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input165 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input166 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input167 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input168 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input169 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input170 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input171 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input172 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input173 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input174 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input175 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input176 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input177 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input178 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input179 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input180 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input181 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input182 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input183 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input184 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input185 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input186 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input187 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input188 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input189 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input190 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input191 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input192 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input193 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input194 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input195 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input196 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input197 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input198 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input199 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input200 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input201 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input202 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input203 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input204 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input205 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input206 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input207 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input208 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input209 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input210 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input211 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input212 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input213 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input214 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input215 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input216 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input217 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input218 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input219 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input220 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input221 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input222 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input223 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input224 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input225 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input226 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input227 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input228 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input229 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input230 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input231 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input232 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input233 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input234 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input235 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input236 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input237 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input238 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input239 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input240 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input241 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input242 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input243 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input244 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input245 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input246 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input247 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input248 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input249 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input250 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input251 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input252 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input253 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input254 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input255 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input256 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input257 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input258 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input259 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input260 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input261 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input262 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input263 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input264 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input265 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input266 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input267 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input268 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input269 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input270 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input271 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input272 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input273 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input274 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input275 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input276 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input277 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input278 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input279 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input280 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input281 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input282 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input283 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input284 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input285 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input286 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input287 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input288 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input289 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input290 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input291 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input292 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input293 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input294 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input295 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input296 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input297 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input298 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input299 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input300 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input301 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input302 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input303 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input304 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input305 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input306 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input307 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input308 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input309 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input310 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input311 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input312 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input313 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input314 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input315 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input316 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input317 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input318 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input319 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input320 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input321 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input322 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input323 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input324 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input325 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input326 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input327 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input328 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input329 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input330 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input331 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input332 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input333 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input334 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input335 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input336 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input337 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input338 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input339 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input340 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input341 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input342 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input343 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input344 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input345 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input346 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input347 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input348 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input349 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input350 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input351 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input352 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input353 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input354 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input355 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input356 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input357 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input358 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input359 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input360 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input361 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input362 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input363 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input364 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input365 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input366 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input367 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input368 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input369 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input370 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input371 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input372 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input373 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input374 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input375 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input376 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input377 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input378 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input379 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input380 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input381 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input382 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input383 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input384 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input385 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input386 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input387 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input388 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input389 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input390 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input391 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input392 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input393 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input394 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input395 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input396 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input397 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input398 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input399 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input400 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input401 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input402 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input403 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input404 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input405 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input406 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input407 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input408 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input409 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input410 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input411 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input412 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input413 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input414 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input415 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input416 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input417 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input418 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input419 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input420 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input421 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input422 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input423 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input424 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input425 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input426 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input427 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input428 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input429 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input430 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input431 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input432 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input433 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input434 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input435 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input436 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input437 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input438 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input439 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input440 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input441 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input442 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input443 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input444 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input445 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input446 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input447 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input448 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input449 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input450 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input451 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input452 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input453 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input454 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input455 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input456 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input457 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input458 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input459 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input460 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input461 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input462 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input463 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input464 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input465 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input466 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input467 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input468 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input469 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input470 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input471 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input472 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input473 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Network input: input474 +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: wrote bir.json +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: wrote tensor_map.json +2025-08-07T13:55:37Z INFO 48500 [job.Frontend.0]: Job #0 finished +2025-08-07T13:55:37Z INFO 48500 [pipeline.Pipeline.0]: Finished job job.Frontend.0 +2025-08-07T13:55:37Z INFO 48500 [pipeline.Pipeline.0]: Starting job job.StaticIOTranspose.0 +2025-08-07T13:55:37Z INFO 48500 [pipeline.Pipeline.0]: Finished job job.StaticIOTranspose.0 +2025-08-07T13:55:37Z INFO 48500 [pipeline.Pipeline.0]: Starting job job.WalrusDriver.0 +2025-08-07T13:55:37Z INFO 48500 [job.WalrusDriver.0]: BackendDriver has 1 states with 1 core LNC +2025-08-07T13:55:37Z INFO 48500 [job.WalrusDriver.0]: BackendDriver: no partitions found. Switching to flat flow. +2025-08-07T13:55:37Z INFO 48500 [job.WalrusDriver.0]: Job WalrusDriver len(in_states) 1 +2025-08-07T13:55:37Z INFO 48500 [job.WalrusDriver.0]: Processing input #0 +2025-08-07T13:55:37Z INFO 48500 [job.WalrusDriver.0]: BackendDriver in_state.num_states 1 with 1 core LNC +2025-08-07T13:55:37Z INFO 48500 [job.WalrusDriver.0]: Executing /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/walrus_driver --optlevel 2 --allocator coloring --verbose 35 --logfile-verbose 20 --logfile /home/ubuntu/qwen3/token_generation_model/_tp0_bk1/log-neuron-cc.txt --execute-repetition 1 -i bir.json --min_split_size 10240 --skip_split_vns '' --no_split_dram --split_huge_dram_tensor 1.0 --preprocessing_only --max_tensorizer_distance 64 --pack_same_shape_only --instruction_fetch_latency 511 --max-partitions 1 --policy 3 --auxflag 0 --interleave none --schedule-delayed-latency 1 --postsched-mm-accum-reorder=false --max-load-color-rotation --max-load-lower-bound 0.14 --mm-reorder-opt --force-prefetch-follow-incoming-order -1 --allreduce-buffer-size 500 --dram-page-size 512 --dram-rotation-size -1 --allreduce-rotation-dis 8 --repeat-load-thres 4 --enable-mm-transpose-remat-optimization=true --save-len-thres 512 --save-dma-cnt-thres 32 --relaxed-order=true --enable-anti-dependence-reduction=false --num-semaphores-per-queue 16 --numcores 1 --act-root-json /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/pwp/pwp_bin_trainium/act_info.json --dve-root-json /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen2/dve_info.json --unified-backend-and-legacy-codegen --tensor-map tensor_map.json --enable-verifier=true --enable-birsim=false --enable-birsim-sync-only=false --enable-data-race-checker=false --enable-new-backend=true --inject-error=NONE --dge-levels io,vector_dynamic_offsets,scalar_dynamic_offset --dynamic-dma-scratch-size-per-partition=16384 --neff-output-filename /home/ubuntu/qwen3/token_generation_model/_tp0_bk1/model.MODULE_d608453625db6ed38994+e5eecdd4.neff +2025-08-07T13:55:37Z INFO 48500 [job.WalrusDriver.0]: Working directory is /home/ubuntu/qwen3/token_generation_model/_tp0_bk1/neuronxcc-hdngl0fs/sg00 +2025-08-07T13:55:37Z INFO 48500 [job.WalrusDriver.0]: propagate_exit=True +2025-08-07T13:55:37Z INFO 48500 [job.WalrusDriver.0]: use_logger=False +2025-08-07T13:55:37Z INFO 48500 [job.WalrusDriver.0]: expose_stderr=True +2025-08-07T13:55:37Z INFO 49811 [Logging]: Logging to ../../log-neuron-cc.txt at level 'INFO' +2025-08-07T13:55:37Z INFO 49811 [BackendDriver]: max_allowed_parallelism=128 +2025-08-07T13:55:37Z INFO 49811 [BackendDriver]: Backend driver mtBackend: false numModules: 1 Cwd: "/home/ubuntu/qwen3/token_generation_model/_tp0_bk1/neuronxcc-hdngl0fs/sg00" +2025-08-07T13:55:37Z INFO 49811 [BackendDriver]: DynamicDMA is enabled +2025-08-07T13:55:37Z INFO 49811 [BackendDriver]: DynamicDMA levels being enabled: io, scalar_dynamic_offset, vector_dynamic_offsets, +2025-08-07T13:55:37Z USER 49811 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:55:37Z INFO 49811 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=7337 blocks=1 instructions=7225 Max writers: 191 Max Readers: 475 +2025-08-07T13:55:37Z USER 49811 [ModuleForkPass]: Running do_nothing +2025-08-07T13:55:37Z INFO 49811 [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=7337 blocks=1 instructions=7225 Max writers: 191 Max Readers: 475 +2025-08-07T13:55:37Z USER 49811 [ModuleForkPass]: do_nothing finished after 0.001 seconds +2025-08-07T13:55:37Z INFO 49811 [ModuleForkPass]: curr_vmrss: 214mb, ru_maxrss: 694mb (delta=0mb) +2025-08-07T13:55:37Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 7337 memory location(s), 1 block(s), and 7225 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:55:37Z USER 49811 [ModuleForkPass]: Running birverifier +2025-08-07T13:55:37Z INFO 49811 [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=7337 blocks=1 instructions=7225 Max writers: 191 Max Readers: 475 +2025-08-07T13:55:37Z WARNING 49811 [birverifier::InstVisitor]: (module) Non - output memory location with no reader: {convert.345.62970}@SB<0,0>(1x2)#Internal DebugInfo: +2025-08-07T13:55:38Z USER 49811 [ModuleForkPass]: birverifier finished after 0.220 seconds +2025-08-07T13:55:38Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1006mb, ru_maxrss: 1006mb (delta=312mb) +2025-08-07T13:55:38Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 7337 memory location(s), 1 block(s), and 7225 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:55:38Z USER 49811 [BackendPassManager]: mod_parallel_pass finished after 0.226 seconds +2025-08-07T13:55:38Z INFO 49811 [BackendPassManager]: curr_vmrss: 998mb, ru_maxrss: 1006mb (delta=312mb) +2025-08-07T13:55:38Z INFO 49811 [BackendPassManager]: Output has 1 module(s), 1 function(s), 7337 memory location(s), 1 block(s), and 7225 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:55:38Z USER 49811 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:55:38Z INFO 49811 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=1 allocs=7337 blocks=1 instructions=7225 Max writers: 191 Max Readers: 475 +2025-08-07T13:55:38Z USER 49811 [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:55:38Z INFO 49811 [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=7337 blocks=1 instructions=7225 Max writers: 191 Max Readers: 475 +2025-08-07T13:55:38Z USER 49811 [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-08-07T13:55:38Z INFO 49811 [SubgraphForkPass]: curr_vmrss: 998mb, ru_maxrss: 1006mb (delta=0mb) +2025-08-07T13:55:38Z INFO 49811 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 7337 memory location(s), 1 block(s), and 7225 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:55:38Z USER 49811 [BackendPassManager]: subgraph_parallel_pass finished after 0.002 seconds +2025-08-07T13:55:38Z INFO 49811 [BackendPassManager]: curr_vmrss: 998mb, ru_maxrss: 1006mb (delta=0mb) +2025-08-07T13:55:38Z INFO 49811 [BackendPassManager]: Output has 1 module(s), 1 function(s), 7337 memory location(s), 1 block(s), and 7225 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:55:38Z USER 49811 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:55:38Z INFO 49811 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=7337 blocks=1 instructions=7225 Max writers: 191 Max Readers: 475 +2025-08-07T13:55:38Z USER 49811 [ModuleForkPass]: Running expand_replication +2025-08-07T13:55:38Z INFO 49811 [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=7337 blocks=1 instructions=7225 Max writers: 191 Max Readers: 475 +2025-08-07T13:55:38Z INFO 49811 [ExpandReplication]: Found 0 replicated matmults +2025-08-07T13:55:38Z USER 49811 [ModuleForkPass]: expand_replication finished after 0.001 seconds +2025-08-07T13:55:38Z INFO 49811 [ModuleForkPass]: curr_vmrss: 998mb, ru_maxrss: 1006mb (delta=0mb) +2025-08-07T13:55:38Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 7337 memory location(s), 1 block(s), and 7225 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:55:38Z USER 49811 [ModuleForkPass]: Running unroll +2025-08-07T13:55:38Z INFO 49811 [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=7337 blocks=1 instructions=7225 Max writers: 191 Max Readers: 475 +2025-08-07T13:55:38Z INFO 49811 [Unroll]: INFO (Unroll) Start unrolling at Thu Aug 7 13:55:38 2025 +2025-08-07T13:55:40Z INFO 49811 [Unroll]: INFO (Unroll) DONE unrolling Thu Aug 7 13:55:38 2025 + +2025-08-07T13:55:40Z INFO 49811 [Unroll]: sg0000 Instruction count after Unroll: +2025-08-07T13:55:40Z INFO 49811 [Unroll]: Total count: 277896 +2025-08-07T13:55:40Z INFO 49811 [Unroll]: Matmult: 252168 +2025-08-07T13:55:40Z INFO 49811 [Unroll]: GenericCopy: 11639 +2025-08-07T13:55:40Z INFO 49811 [Unroll]: Load: 8476 +2025-08-07T13:55:40Z INFO 49811 [Unroll]: TensorScalarPtr: 1482 +2025-08-07T13:55:40Z INFO 49811 [Unroll]: TensorTensor: 1125 +2025-08-07T13:55:40Z INFO 49811 [Unroll]: Save: 682 +2025-08-07T13:55:40Z INFO 49811 [Unroll]: Activation: 545 +2025-08-07T13:55:40Z INFO 49811 [Unroll]: Memset: 299 +2025-08-07T13:55:40Z INFO 49811 [Unroll]: Max: 224 +2025-08-07T13:55:40Z INFO 49811 [Unroll]: MaxIndex: 224 +2025-08-07T13:55:40Z INFO 49811 [Unroll]: StreamShuffle: 222 +2025-08-07T13:55:40Z INFO 49811 [Unroll]: MatchReplace: 217 +2025-08-07T13:55:40Z INFO 49811 [Unroll]: TensorReduce: 151 +2025-08-07T13:55:40Z INFO 49811 [Unroll]: CollectiveCompute: 75 +2025-08-07T13:55:40Z INFO 49811 [Unroll]: Reciprocal: 75 +2025-08-07T13:55:40Z INFO 49811 [Unroll]: DMACopy: 74 +2025-08-07T13:55:40Z INFO 49811 [Unroll]: Iota: 73 +2025-08-07T13:55:40Z INFO 49811 [Unroll]: StreamTranspose: 72 +2025-08-07T13:55:40Z INFO 49811 [Unroll]: Select: 38 +2025-08-07T13:55:40Z INFO 49811 [Unroll]: Gather: 35 +2025-08-07T13:55:40Z INFO 49811 [Unroll]: Unrolled DGE count with Dynamic AP: 73 +2025-08-07T13:55:40Z USER 49811 [ModuleForkPass]: unroll finished after 2.726 seconds +2025-08-07T13:55:40Z INFO 49811 [ModuleForkPass]: curr_vmrss: 2389mb, ru_maxrss: 2389mb (delta=1383mb) +2025-08-07T13:55:40Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28780 memory location(s), 1 block(s), and 277896 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:40Z USER 49811 [BackendPassManager]: mod_parallel_pass finished after 2.774 seconds +2025-08-07T13:55:40Z INFO 49811 [BackendPassManager]: curr_vmrss: 1488mb, ru_maxrss: 2389mb (delta=1383mb) +2025-08-07T13:55:40Z INFO 49811 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28780 memory location(s), 1 block(s), and 277896 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:40Z USER 49811 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:55:40Z INFO 49811 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=1 allocs=28780 blocks=1 instructions=277896 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:40Z USER 49811 [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:55:40Z INFO 49811 [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=28780 blocks=1 instructions=277896 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:40Z INFO 49811 [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:55:41Z INFO 49811 [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:55:41Z INFO 49811 [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:55:41Z INFO 49811 [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:55:41Z USER 49811 [SubgraphForkPass]: dead_code_elim finished after 0.305 seconds +2025-08-07T13:55:41Z INFO 49811 [SubgraphForkPass]: curr_vmrss: 1495mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:55:41Z INFO 49811 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 28159 memory location(s), 1 block(s), and 277895 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:41Z USER 49811 [BackendPassManager]: subgraph_parallel_pass finished after 0.310 seconds +2025-08-07T13:55:41Z INFO 49811 [BackendPassManager]: curr_vmrss: 1495mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:55:41Z INFO 49811 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28159 memory location(s), 1 block(s), and 277895 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:41Z USER 49811 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:55:41Z INFO 49811 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=28159 blocks=1 instructions=277895 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:41Z USER 49811 [ModuleForkPass]: Running birverifier +2025-08-07T13:55:41Z INFO 49811 [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=28159 blocks=1 instructions=277895 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:41Z USER 49811 [ModuleForkPass]: birverifier finished after 0.273 seconds +2025-08-07T13:55:41Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1510mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:55:41Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28159 memory location(s), 1 block(s), and 277895 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:41Z USER 49811 [BackendPassManager]: mod_parallel_pass finished after 0.277 seconds +2025-08-07T13:55:41Z INFO 49811 [BackendPassManager]: curr_vmrss: 1510mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:55:41Z INFO 49811 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28159 memory location(s), 1 block(s), and 277895 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:41Z USER 49811 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:55:41Z INFO 49811 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=1 allocs=28159 blocks=1 instructions=277895 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:41Z USER 49811 [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:55:41Z INFO 49811 [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=28159 blocks=1 instructions=277895 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:41Z USER 49811 [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-08-07T13:55:41Z INFO 49811 [SubgraphForkPass]: curr_vmrss: 1510mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:55:41Z INFO 49811 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 28159 memory location(s), 1 block(s), and 277895 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:41Z USER 49811 [BackendPassManager]: subgraph_parallel_pass finished after 0.003 seconds +2025-08-07T13:55:41Z INFO 49811 [BackendPassManager]: curr_vmrss: 1510mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:55:41Z INFO 49811 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28159 memory location(s), 1 block(s), and 277895 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:41Z USER 49811 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:55:41Z INFO 49811 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=28159 blocks=1 instructions=277895 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:41Z USER 49811 [ModuleForkPass]: Running instruction_reorder +2025-08-07T13:55:41Z INFO 49811 [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=28159 blocks=1 instructions=277895 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:41Z USER 49811 [ModuleForkPass]: instruction_reorder finished after 0.046 seconds +2025-08-07T13:55:41Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1510mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:55:41Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28159 memory location(s), 1 block(s), and 277895 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:41Z USER 49811 [ModuleForkPass]: Running psum_legalization +2025-08-07T13:55:41Z INFO 49811 [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=28159 blocks=1 instructions=277895 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:41Z USER 49811 [ModuleForkPass]: psum_legalization finished after 0.024 seconds +2025-08-07T13:55:41Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1510mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:55:41Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28159 memory location(s), 1 block(s), and 277895 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:41Z USER 49811 [ModuleForkPass]: Running legalize_cce_dma +2025-08-07T13:55:41Z INFO 49811 [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=28159 blocks=1 instructions=277895 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:41Z USER 49811 [ModuleForkPass]: legalize_cce_dma finished after 0.026 seconds +2025-08-07T13:55:41Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1510mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:55:41Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28159 memory location(s), 1 block(s), and 277895 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:41Z USER 49811 [ModuleForkPass]: Running error_injector +2025-08-07T13:55:41Z INFO 49811 [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=28159 blocks=1 instructions=277895 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:41Z WARNING 49811 [ErrorInjector]: Unrecognized injected error value "0" +2025-08-07T13:55:41Z USER 49811 [ModuleForkPass]: error_injector finished after 0.001 seconds +2025-08-07T13:55:41Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1510mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:55:41Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28159 memory location(s), 1 block(s), and 277895 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:41Z USER 49811 [ModuleForkPass]: Running vn_splitter +2025-08-07T13:55:41Z INFO 49811 [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=28159 blocks=1 instructions=277895 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:41Z INFO 49811 [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 14 +2025-08-07T13:55:41Z INFO 49811 [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-08-07T13:55:41Z INFO 49811 [ShrinkDN]: INFO (ShrinkDN): Shrunk 3 nodes. Total savings 14462 bytes/partition +2025-08-07T13:55:41Z INFO 49811 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-08-07T13:55:41Z INFO 49811 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-08-07T13:55:41Z INFO 49811 [VNSplitterPass]: INFO (VNSplitter) Time: 0.001 seconds +2025-08-07T13:55:41Z INFO 49811 [VNSplitterPass]: INFO (VerticalFusion) Time: 0.033 seconds +2025-08-07T13:55:41Z INFO 49811 [VNSplitterPass]: INFO (ShrinkDN) Time: 0.046 seconds +2025-08-07T13:55:41Z USER 49811 [ModuleForkPass]: vn_splitter finished after 0.132 seconds +2025-08-07T13:55:41Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1514mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:55:41Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28159 memory location(s), 1 block(s), and 277895 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:41Z USER 49811 [ModuleForkPass]: Running constant_propagate +2025-08-07T13:55:41Z INFO 49811 [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=28159 blocks=1 instructions=277895 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:41Z INFO 49811 [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-08-07T13:55:41Z INFO 49811 [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:55:41Z INFO 49811 [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:55:42Z INFO 49811 [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:55:42Z INFO 49811 [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:55:42Z INFO 49811 [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-08-07T13:55:42Z INFO 49811 [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:55:42Z INFO 49811 [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:55:42Z INFO 49811 [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:55:42Z INFO 49811 [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:55:42Z USER 49811 [ModuleForkPass]: constant_propagate finished after 0.694 seconds +2025-08-07T13:55:42Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1516mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:55:42Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28159 memory location(s), 1 block(s), and 277895 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:42Z USER 49811 [ModuleForkPass]: Running lower_ac +2025-08-07T13:55:42Z INFO 49811 [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=28159 blocks=1 instructions=277895 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:42Z INFO 49811 [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-08-07T13:55:42Z USER 49811 [ModuleForkPass]: lower_ac finished after 0.042 seconds +2025-08-07T13:55:42Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1516mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:55:42Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28159 memory location(s), 1 block(s), and 277895 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:42Z USER 49811 [ModuleForkPass]: Running input_dma_coalescing +2025-08-07T13:55:42Z INFO 49811 [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=28159 blocks=1 instructions=277895 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:42Z INFO 49811 [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-08-07T13:55:42Z USER 49811 [ModuleForkPass]: input_dma_coalescing finished after 0.082 seconds +2025-08-07T13:55:42Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1517mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:55:42Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28159 memory location(s), 1 block(s), and 277895 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:42Z USER 49811 [ModuleForkPass]: Running remat_optimization +2025-08-07T13:55:42Z INFO 49811 [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=28159 blocks=1 instructions=277895 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:42Z INFO 49811 [RematOpt]: Removed 0 remat instructions +2025-08-07T13:55:42Z USER 49811 [ModuleForkPass]: remat_optimization finished after 0.150 seconds +2025-08-07T13:55:42Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1519mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:55:42Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28159 memory location(s), 1 block(s), and 277895 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:42Z USER 49811 [ModuleForkPass]: Running early_peephole_opts +2025-08-07T13:55:42Z INFO 49811 [ModuleForkPass]: Inputs to early_peephole_opts: modules=1 functions=1 allocs=28159 blocks=1 instructions=277895 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:42Z INFO 49811 [EarlyPeepholeOpts]: PeepholeOpts enabled? ActivationAccumulate: true +2025-08-07T13:55:42Z INFO 49811 [EarlyPeepholeOpts]: Activation Accumulate: 0 +2025-08-07T13:55:42Z USER 49811 [ModuleForkPass]: early_peephole_opts finished after 0.115 seconds +2025-08-07T13:55:42Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1519mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:55:42Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28159 memory location(s), 1 block(s), and 277895 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:42Z USER 49811 [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-08-07T13:55:42Z INFO 49811 [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=28159 blocks=1 instructions=277895 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:42Z USER 49811 [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.023 seconds +2025-08-07T13:55:42Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1519mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:55:42Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28159 memory location(s), 1 block(s), and 277895 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:42Z USER 49811 [ModuleForkPass]: Running infer_stream_ids +2025-08-07T13:55:42Z INFO 49811 [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=28159 blocks=1 instructions=277895 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:42Z USER 49811 [ModuleForkPass]: infer_stream_ids finished after 0.023 seconds +2025-08-07T13:55:42Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1519mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:55:42Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28159 memory location(s), 1 block(s), and 277895 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:42Z USER 49811 [ModuleForkPass]: Running pre_sched +2025-08-07T13:55:42Z INFO 49811 [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=28159 blocks=1 instructions=277895 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:42Z INFO 49811 [PreSched]: Start PRE scheduling 2 cores: 1 at: Thu Aug 7 13:55:42 2025 +2025-08-07T13:55:42Z INFO 49811 [LayerSpiller]: LayerSpill: Start... +2025-08-07T13:55:42Z INFO 49811 [LayerSpiller]: LayerSpill: Found 72 Splits CCs +2025-08-07T13:55:42Z INFO 49811 [LayerSpiller]: Grouped CCs to 72 clusters. +2025-08-07T13:55:42Z INFO 49811 [LayerSpiller]: LayerSpill: To Spill 60 multi-layer tensors +2025-08-07T13:55:42Z INFO 49811 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-08-07T13:55:42Z INFO 49811 [LayerSpiller]: LayerSpill: Done. +2025-08-07T13:55:42Z INFO 49811 [PreSched]: Start split live ranges Thu Aug 7 13:55:42 2025 +2025-08-07T13:55:43Z INFO 49811 [PreSched]: Num_Splits: 0 +2025-08-07T13:55:43Z INFO 49811 [PreSched]: End split live ranges Thu Aug 7 13:55:43 2025 +2025-08-07T13:55:43Z INFO 49811 [PreSched]: Strt remove redundncies Thu Aug 7 13:55:43 2025 +2025-08-07T13:55:43Z INFO 49811 [PreSched]: remove_redundant_memsets +2025-08-07T13:55:43Z INFO 49811 [PreSched]: remove_redundant_memsets: 0 +2025-08-07T13:55:43Z INFO 49811 [PreSched]: remove_redundant_loads +2025-08-07T13:55:43Z INFO 49811 [PreSched]: remove_redundant_loads: 0 +2025-08-07T13:55:43Z INFO 49811 [PreSched]: End remove redundncies Thu Aug 7 13:55:43 2025 +2025-08-07T13:55:43Z INFO 49811 [PreSched]: Start DCE Thu Aug 7 13:55:43 2025 +2025-08-07T13:55:43Z INFO 49811 [PreSched]: eliminateDeadStore removed 0 instructions +2025-08-07T13:55:43Z INFO 49811 [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:55:43Z INFO 49811 [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:55:43Z INFO 49811 [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:55:43Z INFO 49811 [PreSched]: End DCE Thu Aug 7 13:55:43 2025 +2025-08-07T13:55:43Z INFO 49811 [PreSched]: Start build flow dependencies Thu Aug 7 13:55:43 2025 +2025-08-07T13:55:43Z INFO 49811 [build_flow_deps]: Start build fdeps. Invocation: 1Thu Aug 7 13:55:43 2025 +2025-08-07T13:55:43Z INFO 49811 [build_flow_deps]: Allocs: 28279 instructions: 278015 +2025-08-07T13:55:44Z INFO 49811 [build_flow_deps]: Build fdeps inserted 818331 edges +2025-08-07T13:55:44Z INFO 49811 [build_flow_deps]: Done build fdeps 818331 Thu Aug 7 13:55:44 2025 +2025-08-07T13:55:44Z INFO 49811 [PreSched]: End build flow dependencies Thu Aug 7 13:55:44 2025 +2025-08-07T13:55:44Z INFO 49811 [PreSched]: Start remove useless insts Thu Aug 7 13:55:44 2025 +2025-08-07T13:55:44Z INFO 49811 [PreSched]: remove_useless_insts +2025-08-07T13:55:44Z INFO 49811 [PreSched]: remove Useless Instructions: 0 +2025-08-07T13:55:44Z INFO 49811 [PreSched]: End remove useless insts Thu Aug 7 13:55:44 2025 +2025-08-07T13:55:44Z INFO 49811 [PreSched]: Start scratchpad optimization Thu Aug 7 13:55:44 2025 +2025-08-07T13:55:44Z INFO 49811 [PreSched]: End scratchpad optimization Thu Aug 7 13:55:44 2025 +2025-08-07T13:55:44Z INFO 49811 [PreSched]: DONE PRE scheduling Thu Aug 7 13:55:44 2025 +2025-08-07T13:55:44Z USER 49811 [ModuleForkPass]: pre_sched finished after 1.753 seconds +2025-08-07T13:55:44Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1667mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:55:44Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28279 memory location(s), 1 block(s), and 278015 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:44Z USER 49811 [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:55:44Z INFO 49811 [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=28279 blocks=1 instructions=278015 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:44Z INFO 49811 [TensorCopyElim]: Tensor CP elimination: 1 +2025-08-07T13:55:44Z INFO 49811 [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:55:44Z INFO 49811 [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:55:45Z INFO 49811 [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:55:45Z INFO 49811 [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:55:45Z USER 49811 [ModuleForkPass]: tensor_copy_elim finished after 0.442 seconds +2025-08-07T13:55:45Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1667mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:55:45Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28278 memory location(s), 1 block(s), and 278014 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:45Z USER 49811 [ModuleForkPass]: Running dynamic_dma_setup +2025-08-07T13:55:45Z INFO 49811 [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=28278 blocks=1 instructions=278014 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:45Z USER 49811 [ModuleForkPass]: dynamic_dma_setup finished after 0.001 seconds +2025-08-07T13:55:45Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1667mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:55:45Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28279 memory location(s), 1 block(s), and 278014 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:45Z USER 49811 [ModuleForkPass]: Running runtime_memory_reservation +2025-08-07T13:55:45Z INFO 49811 [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=28279 blocks=1 instructions=278014 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:45Z USER 49811 [ModuleForkPass]: runtime_memory_reservation finished after 0.001 seconds +2025-08-07T13:55:45Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1667mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:55:45Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28279 memory location(s), 1 block(s), and 278014 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:45Z USER 49811 [ModuleForkPass]: Running coloring_allocator_psum +2025-08-07T13:55:45Z INFO 49811 [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=28279 blocks=1 instructions=278014 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:45Z INFO 49811 [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:55:45Z INFO 49811 [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:55:45Z INFO 49811 [PSUM_Allocator]: allocating PSUM +2025-08-07T13:55:45Z INFO 49811 [PSUM_Allocator]: main loop +2025-08-07T13:55:45Z INFO 49811 [PSUM_Allocator]: renumber locations +2025-08-07T13:55:45Z INFO 49811 [PSUM_Allocator]: size = 11778 +2025-08-07T13:55:45Z INFO 49811 [PSUM_Allocator]: build_no_bitmap start +2025-08-07T13:55:45Z INFO 49811 [PSUM_Allocator]: 100% PSUM demand before spilling +2025-08-07T13:55:45Z INFO 49811 [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-08-07T13:55:45Z INFO 49811 [PSUM_Allocator]: found 22069 edges +2025-08-07T13:55:45Z INFO 49811 [PSUM_Allocator]: mean: 3.7475 +2025-08-07T13:55:45Z INFO 49811 [PSUM_Allocator]: median: 2.27454 +2025-08-07T13:55:45Z INFO 49811 [PSUM_Allocator]: adjacency vectors require 176552 bytes +2025-08-07T13:55:45Z INFO 49811 [PSUM_Allocator]: build_no_bitmap done +2025-08-07T13:55:45Z INFO 49811 [PSUM_Allocator]: find costs +2025-08-07T13:55:50Z INFO 49811 [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-08-07T13:55:50Z INFO 49811 [PSUM_Allocator]: simplify interference graph +2025-08-07T13:55:50Z INFO 49811 [PSUM_Allocator]: initialize low and high +2025-08-07T13:55:50Z INFO 49811 [PSUM_Allocator]: lo = 11778 +2025-08-07T13:55:50Z INFO 49811 [PSUM_Allocator]: hi = 0 +2025-08-07T13:55:50Z INFO 49811 [PSUM_Allocator]: inf = 0 +2025-08-07T13:55:50Z INFO 49811 [PSUM_Allocator]: total = 11778 +2025-08-07T13:55:50Z INFO 49811 [PSUM_Allocator]: simplify +2025-08-07T13:55:50Z INFO 49811 [PSUM_Allocator]: new candidates = 0 +2025-08-07T13:55:50Z INFO 49811 [PSUM_Allocator]: select ranges +2025-08-07T13:55:50Z INFO 49811 [PSUM_Allocator]: no more spills +2025-08-07T13:55:50Z INFO 49811 [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-08-07T13:55:50Z INFO 49811 [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-08-07T13:55:50Z INFO 49811 [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-08-07T13:55:50Z USER 49811 [ModuleForkPass]: coloring_allocator_psum finished after 5.723 seconds +2025-08-07T13:55:50Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1671mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:55:50Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28279 memory location(s), 1 block(s), and 278014 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:50Z USER 49811 [ModuleForkPass]: Running dma_optimization_psum +2025-08-07T13:55:50Z INFO 49811 [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=28279 blocks=1 instructions=278014 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:51Z INFO 49811 [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-08-07T13:55:51Z INFO 49811 [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-08-07T13:55:51Z USER 49811 [ModuleForkPass]: dma_optimization_psum finished after 0.194 seconds +2025-08-07T13:55:51Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1671mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:55:51Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28279 memory location(s), 1 block(s), and 278014 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:51Z USER 49811 [ModuleForkPass]: Running address_rotation_psum +2025-08-07T13:55:51Z INFO 49811 [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=28279 blocks=1 instructions=278014 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:51Z INFO 49811 [DMAOptimizationBase]: PSUM Rotation rotated 905 PSUM Banks +2025-08-07T13:55:51Z INFO 49811 [DMAOptimizationBase]: PSUM Rotation rotated 129 PSUM Banks +2025-08-07T13:55:52Z INFO 49811 [DMAOptimizationBase]: PSUM Rotation rotated 349 PSUM Banks +2025-08-07T13:55:52Z USER 49811 [ModuleForkPass]: address_rotation_psum finished after 1.067 seconds +2025-08-07T13:55:52Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1678mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:55:52Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28279 memory location(s), 1 block(s), and 278014 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:52Z USER 49811 [ModuleForkPass]: Running coloring_allocator_sb +2025-08-07T13:55:52Z INFO 49811 [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=28279 blocks=1 instructions=278014 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:55:52Z INFO 49811 [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 7592802374 +2025-08-07T13:55:52Z INFO 49811 [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 7131 bytes +2025-08-07T13:55:52Z INFO 49811 [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 2812170 +2025-08-07T13:55:52Z INFO 49811 [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 397 bytes +2025-08-07T13:55:52Z INFO 49811 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 78980 +2025-08-07T13:55:52Z INFO 49811 [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 136 bytes +2025-08-07T13:55:52Z INFO 49811 [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:55:52Z INFO 49811 [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: allocating SB +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: main loop +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: renumber locations +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: size = 15649 +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: find partners +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: found 11559 accumulation groups +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: largest = _dot.11007-t42821 +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: tensors = 49 +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: requires 393280 bytes/partition +2025-08-07T13:55:52Z WARNING 49811 [SB_Allocator]: accumulation group is too large for SB +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: expanding partners +2025-08-07T13:55:52Z INFO 49811 []: find first defs for local +2025-08-07T13:55:52Z INFO 49811 []: find first defs for global +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: find loads +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: 1 pin count +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: 8449 remat count +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: build interference graph +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: pass 1 int-tree +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: Num intervals 15649 Num locations 15649 +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: IntervalTree Build Done +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: info.neighbors init Done +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: info.neighbors partners Done +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: IntervalTree readback Done +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: edge: 145282 +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: mean: 18.5676 +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: median: 10.2113 +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: find costs +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: simplify interference graph +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: initialize safe & unsafe +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: safe = 15145 +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: unsafe = 326 +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: inf = 177 +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: total = 15648 +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: simplify +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: simplify_step3_sorted2 #Unsafe 106 #Pinned 0 #Safe 0 minCost 0.00302294 maxCost 2.36906 locations 15649 +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: new candidates = 8 +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: select ranges +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: Total: 15648 +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: Spilled: 0.000 (0) +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: Allocated: 1.000 (15648) +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: Rover zone: 0.968 (15143) +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: Pre-rover zone: 0.024 (373) +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: Post-rover zone: 0.008 (128) +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: Slice zone: 0.000 (4) +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: Blocks nothing: 0.047 (741) +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: Blocks medium: 0.005 (78) +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: Visited until medium blocking (mean): 0.381 +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: Visited until medium blocking (median): 0.355 +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: Visited until medium blocking (p95): 0.740 +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: Blocks tall: 0.948 (14829) +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: Visited until tall blocking (mean): 0.899 +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-08-07T13:55:52Z INFO 49811 [SB_Allocator]: Success +2025-08-07T13:56:17Z INFO 49811 [SB_Allocator]: SB spills = 0 tensors +2025-08-07T13:56:17Z INFO 49811 [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:56:17Z INFO 49811 [SB_Allocator]: remats = 0 tensors +2025-08-07T13:56:17Z INFO 49811 [SB_Allocator]: unpinned = 0 tensors +2025-08-07T13:56:17Z INFO 49811 [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:56:17Z INFO 49811 [SB_Allocator]: SB score = 0 +2025-08-07T13:56:17Z INFO 49811 [SB_Allocator]: spilling from SB cost about 0 cycles +2025-08-07T13:56:17Z INFO 49811 [SB_Allocator]: 16384 bytes/partition (100%) successfully pinned +2025-08-07T13:56:17Z INFO 49811 [SB_Allocator]: pinning saved approximately 9010 cycles +2025-08-07T13:56:17Z INFO 49811 [SB_Allocator]: 0% SB utilization after allocation +2025-08-07T13:56:17Z INFO 49811 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 7592802374 +2025-08-07T13:56:17Z INFO 49811 [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 7131 bytes +2025-08-07T13:56:17Z INFO 49811 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 2812170 +2025-08-07T13:56:17Z INFO 49811 [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 397 bytes +2025-08-07T13:56:17Z INFO 49811 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 78980 +2025-08-07T13:56:17Z INFO 49811 [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 136 bytes +2025-08-07T13:56:17Z USER 49811 [ModuleForkPass]: coloring_allocator_sb finished after 25.719 seconds +2025-08-07T13:56:17Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1686mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:17Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28279 memory location(s), 1 block(s), and 278014 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:17Z USER 49811 [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:56:17Z INFO 49811 [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=28279 blocks=1 instructions=278014 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:18Z INFO 49811 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:56:18Z USER 49811 [ModuleForkPass]: address_rotation_sb finished after 0.395 seconds +2025-08-07T13:56:18Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1688mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:18Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28279 memory location(s), 1 block(s), and 278014 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:18Z USER 49811 [ModuleForkPass]: Running dma_optimization_sb +2025-08-07T13:56:18Z INFO 49811 [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=28279 blocks=1 instructions=278014 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:18Z INFO 49811 [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 7595614544, 99.9259% input load, 5.2662e-08% output write, 0.074128% spill/reload [sg0000] +2025-08-07T13:56:18Z INFO 49811 [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:56:18Z INFO 49811 [DMAOptimizationBase]: removed 0 identical load +2025-08-07T13:56:18Z INFO 49811 [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:56:18Z INFO 49811 [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:56:18Z INFO 49811 [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-08-07T13:56:18Z INFO 49811 [DMAOptimizationBase]: [IO to internal DMACopy Insertion]: inserted 0 DMACopy instructions +2025-08-07T13:56:18Z INFO 49811 [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-08-07T13:56:18Z INFO 49811 [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-08-07T13:56:18Z INFO 49811 [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 148, 1.94849e-06% out of total dma traffic(7.58998e+09) +2025-08-07T13:56:18Z INFO 49811 [DMAOptimizationBase]: [spill optimization round 0]: removed 6 spill/reload instructions +2025-08-07T13:56:18Z INFO 49811 [DMAOptimizationBase]: [spill optimization round 0]: removed 6 spill/reload memory locations +2025-08-07T13:56:18Z INFO 49811 [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload instructions +2025-08-07T13:56:18Z INFO 49811 [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload memory locations +2025-08-07T13:56:19Z INFO 49811 [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 4100, 0.0728179% out of total spill/reload dma traffic +2025-08-07T13:56:19Z INFO 49811 [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-08-07T13:56:19Z INFO 49811 [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-08-07T13:56:19Z INFO 49811 [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:56:19Z INFO 49811 [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-08-07T13:56:19Z INFO 49811 [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-08-07T13:56:19Z INFO 49811 [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:56:19Z INFO 49811 [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-08-07T13:56:19Z INFO 49811 [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-08-07T13:56:19Z INFO 49811 [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-08-07T13:56:19Z INFO 49811 [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-08-07T13:56:19Z INFO 49811 [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 116 SpillSaves and Reloads +2025-08-07T13:56:19Z INFO 49811 [DMAOptimizationBase]: average loaded DMA size 7144 bytes +2025-08-07T13:56:19Z INFO 49811 [DMAOptimizationBase]: average saved DMA size 539 bytes +2025-08-07T13:56:20Z INFO 49811 [DMAOptimizationBase]: DMA SpillSave Coalescing Round 1 combined 56 SpillSaves and Reloads +2025-08-07T13:56:20Z INFO 49811 [DMAOptimizationBase]: average loaded DMA size 7150 bytes +2025-08-07T13:56:20Z INFO 49811 [DMAOptimizationBase]: average saved DMA size 650 bytes +2025-08-07T13:56:20Z INFO 49811 [DMAOptimizationBase]: DMA SpillSave Coalescing Round 2 combined 0 SpillSaves and Reloads +2025-08-07T13:56:20Z INFO 49811 [DMAOptimizationBase]: average loaded DMA size 7150 bytes +2025-08-07T13:56:20Z INFO 49811 [DMAOptimizationBase]: average saved DMA size 650 bytes +2025-08-07T13:56:20Z INFO 49811 [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 7592800176 +2025-08-07T13:56:20Z INFO 49811 [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 7150 bytes +2025-08-07T13:56:20Z INFO 49811 [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 2810120 +2025-08-07T13:56:20Z INFO 49811 [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 650 bytes +2025-08-07T13:56:20Z INFO 49811 [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:56:20Z INFO 49811 [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:56:20Z INFO 49811 [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 4248, 5.5927e-05% out of total dma traffic +2025-08-07T13:56:20Z INFO 49811 [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 7595610296, 99.9259% input load, 5.2662e-08% output write, 0.0740741% spill/reload [sg0000] +2025-08-07T13:56:20Z INFO 49811 [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 7592800176 +2025-08-07T13:56:20Z INFO 49811 [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 7150 bytes +2025-08-07T13:56:20Z INFO 49811 [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 2810120 +2025-08-07T13:56:20Z INFO 49811 [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 650 bytes +2025-08-07T13:56:20Z INFO 49811 [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 78980 +2025-08-07T13:56:20Z INFO 49811 [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 136 bytes +2025-08-07T13:56:20Z INFO 49811 [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 7119 bytes +2025-08-07T13:56:20Z INFO 49811 [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-08-07T13:56:20Z INFO 49811 [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-08-07T13:56:20Z USER 49811 [ModuleForkPass]: dma_optimization_sb finished after 2.451 seconds +2025-08-07T13:56:20Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1721mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:20Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277885 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:20Z USER 49811 [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:56:20Z INFO 49811 [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=28105 blocks=1 instructions=277885 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:20Z INFO 49811 [DMAOptimizationBase]: SB Rotation rotated 359 Sb address +2025-08-07T13:56:21Z INFO 49811 [DMAOptimizationBase]: SB Rotation rotated 4549 Sb address +2025-08-07T13:56:21Z INFO 49811 [DMAOptimizationBase]: SB Rotation rotated 854 Sb address +2025-08-07T13:56:21Z INFO 49811 [DMAOptimizationBase]: SB Rotation rotated 439 Sb address +2025-08-07T13:56:22Z INFO 49811 [DMAOptimizationBase]: SB Rotation rotated 1832 Sb address +2025-08-07T13:56:22Z INFO 49811 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:56:22Z USER 49811 [ModuleForkPass]: address_rotation_sb finished after 2.016 seconds +2025-08-07T13:56:22Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1721mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:22Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277885 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:22Z USER 49811 [ModuleForkPass]: Running coloring_allocator_dram +2025-08-07T13:56:22Z INFO 49811 [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=28105 blocks=1 instructions=277885 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:22Z INFO 49811 [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:56:22Z INFO 49811 [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:56:22Z INFO 49811 [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-08-07T13:56:22Z INFO 49811 [DRAM_Allocator]: reserved space = 8344450336 bytes +2025-08-07T13:56:22Z INFO 49811 [DRAM_Allocator]: spill space = 3420420 bytes +2025-08-07T13:56:22Z INFO 49811 [DRAM_Allocator]: aligned spill space = 3469312 bytes +2025-08-07T13:56:22Z INFO 49811 [DRAM_Allocator]: dram space = 107374182400 bytes +2025-08-07T13:56:22Z INFO 49811 [DRAM_Allocator]: renumber locations +2025-08-07T13:56:22Z INFO 49811 [DRAM_Allocator]: size = 178 +2025-08-07T13:56:22Z INFO 49811 []: find first defs for local +2025-08-07T13:56:22Z INFO 49811 []: find first defs for global +2025-08-07T13:56:23Z INFO 49811 [DRAM_Allocator]: Num intervals 178 Num locations 178 +2025-08-07T13:56:23Z INFO 49811 [DRAM_Allocator]: IntervalTree Build Done +2025-08-07T13:56:23Z INFO 49811 [DRAM_Allocator]: info.neighbors init Done +2025-08-07T13:56:23Z INFO 49811 [DRAM_Allocator]: IntervalTree readback Done +2025-08-07T13:56:23Z INFO 49811 [DRAM_Allocator]: simplify interference graph +2025-08-07T13:56:23Z INFO 49811 [DRAM_Allocator]: initialize low and high +2025-08-07T13:56:23Z INFO 49811 [DRAM_Allocator]: lo = 178 +2025-08-07T13:56:23Z INFO 49811 [DRAM_Allocator]: hi = 0 +2025-08-07T13:56:23Z INFO 49811 [DRAM_Allocator]: total = 178 +2025-08-07T13:56:23Z INFO 49811 [DRAM_Allocator]: simplify +2025-08-07T13:56:23Z INFO 49811 [DRAM_Allocator]: new candidates = 0 +2025-08-07T13:56:23Z INFO 49811 [DRAM_Allocator]: select ranges +2025-08-07T13:56:23Z INFO 49811 [DRAM_Allocator]: CC buffer size limit 524288000 +2025-08-07T13:56:23Z INFO 49811 [DRAM_Allocator]: allreduce_dram_hwm 1208320 +2025-08-07T13:56:23Z INFO 49811 [DRAM_Allocator]: Real CC buffer size 1208320 +2025-08-07T13:56:23Z INFO 49811 [DRAM_Allocator]: DRAM hwm after allocation: 3117056 +2025-08-07T13:56:23Z INFO 49811 [DRAM_Allocator]: DRAM allocation successful +2025-08-07T13:56:23Z USER 49811 [ModuleForkPass]: coloring_allocator_dram finished after 0.544 seconds +2025-08-07T13:56:23Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1723mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:23Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277885 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:23Z USER 49811 [ModuleForkPass]: Running address_rotation_dram +2025-08-07T13:56:23Z INFO 49811 [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=28105 blocks=1 instructions=277885 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:23Z INFO 49811 [DMAOptimizationBase]: Runtime page size at 512MB +2025-08-07T13:56:23Z INFO 49811 [DMAOptimizationBase]: DRAM hwm before rotation 3117056 +2025-08-07T13:56:23Z INFO 49811 [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-08-07T13:56:23Z INFO 49811 [DMAOptimizationBase]: allreduce hwm 1208320 +2025-08-07T13:56:23Z INFO 49811 [DMAOptimizationBase]: Real CC buffer size 1208320 +2025-08-07T13:56:23Z INFO 49811 [DMAOptimizationBase]: DRAM hwm after rotation 3117056 +2025-08-07T13:56:23Z INFO 49811 [DMAOptimizationBase]: DRAM Rotation rotated 9 Dram address +2025-08-07T13:56:23Z USER 49811 [ModuleForkPass]: address_rotation_dram finished after 0.241 seconds +2025-08-07T13:56:23Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1725mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:23Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277885 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:23Z USER 49811 [ModuleForkPass]: Running tensorcopy_accel +2025-08-07T13:56:23Z INFO 49811 [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=28105 blocks=1 instructions=277885 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:23Z INFO 49811 [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-08-07T13:56:23Z INFO 49811 [TensorCopyAccel::Impl]: Accelerated 72 out of 11936 tensorcopy in Function: sg0000 average acceleration factor: 1 +2025-08-07T13:56:23Z USER 49811 [ModuleForkPass]: tensorcopy_accel finished after 0.028 seconds +2025-08-07T13:56:23Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1725mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:23Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277885 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:23Z USER 49811 [ModuleForkPass]: Running peephole_opts +2025-08-07T13:56:23Z INFO 49811 [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=28105 blocks=1 instructions=277885 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:23Z INFO 49811 [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-08-07T13:56:23Z USER 49811 [ModuleForkPass]: peephole_opts finished after 0.123 seconds +2025-08-07T13:56:23Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1725mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:23Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277923 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:23Z USER 49811 [ModuleForkPass]: Running lower_kernel +2025-08-07T13:56:23Z INFO 49811 [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=28105 blocks=1 instructions=277923 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:23Z INFO 49811 [LowerKernel]: Started running LowerKernel +2025-08-07T13:56:23Z INFO 49811 [LowerKernel]: Start of kernel lowering pass, number of insts: 277923, number of allocs: 28105 +2025-08-07T13:56:23Z INFO 49811 [LowerKernel]: Scan BKs time (s): 0.023886 +2025-08-07T13:56:23Z INFO 49811 [LowerKernel]: Lower BKs time (s): 9e-06 +2025-08-07T13:56:23Z USER 49811 [ModuleForkPass]: lower_kernel finished after 0.028 seconds +2025-08-07T13:56:23Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1725mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:23Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277923 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:23Z USER 49811 [ModuleForkPass]: Running lower_nki_kernel +2025-08-07T13:56:23Z INFO 49811 [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=28105 blocks=1 instructions=277923 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:23Z USER 49811 [ModuleForkPass]: lower_nki_kernel finished after 0.027 seconds +2025-08-07T13:56:23Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1725mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:23Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277923 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:23Z USER 49811 [ModuleForkPass]: Running dynamic_dma_cleanup +2025-08-07T13:56:23Z INFO 49811 [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=28105 blocks=1 instructions=277923 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:23Z USER 49811 [ModuleForkPass]: dynamic_dma_cleanup finished after 0.044 seconds +2025-08-07T13:56:23Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1727mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:23Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277923 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:23Z USER 49811 [ModuleForkPass]: Running birverifier +2025-08-07T13:56:23Z INFO 49811 [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=28105 blocks=1 instructions=277923 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:23Z USER 49811 [ModuleForkPass]: birverifier finished after 0.256 seconds +2025-08-07T13:56:23Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1727mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:23Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277923 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:23Z USER 49811 [ModuleForkPass]: Running dynamic_dma_scan +2025-08-07T13:56:24Z INFO 49811 [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=28105 blocks=1 instructions=277923 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:24Z USER 49811 [ModuleForkPass]: dynamic_dma_scan finished after 0.040 seconds +2025-08-07T13:56:24Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1727mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:24Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277923 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:24Z USER 49811 [ModuleForkPass]: Running build_fdeps +2025-08-07T13:56:24Z INFO 49811 [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=28105 blocks=1 instructions=277923 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:24Z INFO 49811 [build_flow_deps]: Start build fdeps. Invocation: 2Thu Aug 7 13:56:24 2025 +2025-08-07T13:56:24Z INFO 49811 [build_flow_deps]: Allocs: 28105 instructions: 277923 +2025-08-07T13:56:24Z INFO 49811 [build_flow_deps]: Build fdeps inserted 818356 edges +2025-08-07T13:56:24Z INFO 49811 [build_flow_deps]: Done build fdeps 818356 Thu Aug 7 13:56:24 2025 +2025-08-07T13:56:24Z USER 49811 [ModuleForkPass]: build_fdeps finished after 0.747 seconds +2025-08-07T13:56:24Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1737mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:24Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277923 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:24Z USER 49811 [ModuleForkPass]: Running remove_redundancies +2025-08-07T13:56:24Z INFO 49811 [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=28105 blocks=1 instructions=277923 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:24Z INFO 49811 [RemoveRedundancies]: remove_clobbered_writes +2025-08-07T13:56:24Z INFO 49811 [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-08-07T13:56:24Z INFO 49811 [RemoveRedundancies]: remove_useless_insts +2025-08-07T13:56:24Z INFO 49811 [RemoveRedundancies]: remove Useless Instructions: 0 +2025-08-07T13:56:24Z USER 49811 [ModuleForkPass]: remove_redundancies finished after 0.128 seconds +2025-08-07T13:56:24Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1737mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:24Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277923 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:24Z USER 49811 [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:56:24Z INFO 49811 [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=28105 blocks=1 instructions=277923 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:24Z INFO 49811 [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:56:24Z INFO 49811 [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:56:24Z INFO 49811 [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:56:26Z USER 49811 [ModuleForkPass]: anti_dependency_analyzer finished after 1.343 seconds +2025-08-07T13:56:26Z INFO 49811 [ModuleForkPass]: curr_vmrss: 2223mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:26Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277923 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:26Z USER 49811 [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:56:26Z INFO 49811 [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=28105 blocks=1 instructions=277923 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:26Z INFO 49811 [TensorCopyElim]: Tensor CP elimination: 0 +2025-08-07T13:56:26Z INFO 49811 [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:56:26Z USER 49811 [ModuleForkPass]: tensor_copy_elim finished after 0.466 seconds +2025-08-07T13:56:26Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1838mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:26Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277923 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:26Z USER 49811 [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-08-07T13:56:26Z INFO 49811 [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=28105 blocks=1 instructions=277923 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:26Z USER 49811 [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.001 seconds +2025-08-07T13:56:26Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1838mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:26Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277923 instruction(s). Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:26Z USER 49811 [ModuleForkPass]: Running post_sched +2025-08-07T13:56:26Z INFO 49811 [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=28105 blocks=1 instructions=277923 Max writers: 1536 Max Readers: 20251 +2025-08-07T13:56:26Z INFO 49811 [post_scheduler]: Start PosT ScheD 3 sunda Thu Aug 7 13:56:26 2025 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.336-t41850 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.383-t41861 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.668-t41880 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.715-t41891 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.1000-t41910 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.1047-t41921 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.1332-t41940 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.1379-t41951 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.1664-t41970 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.1711-t41981 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.1996-t42000 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.2043-t42011 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.2328-t42030 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.2375-t42041 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.2660-t42060 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.2707-t42071 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.2992-t42090 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.3039-t42101 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.3324-t42120 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.3371-t42131 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.3656-t42150 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.3703-t42161 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.3988-t42180 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.4035-t42191 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.4320-t42210 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.4367-t42221 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.4652-t42240 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.4699-t42251 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.4984-t42270 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.5031-t42281 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.5316-t42300 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.5363-t42311 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.5648-t42330 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.5695-t42341 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.5980-t42360 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.6027-t42371 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.6312-t42390 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.6359-t42401 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.6644-t42420 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.6691-t42431 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.6976-t42450 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.7023-t42461 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.7308-t42480 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.7355-t42491 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.7640-t42510 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.7687-t42521 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.7972-t42540 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.8019-t42551 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.8304-t42570 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.8351-t42581 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.8636-t42600 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.8683-t42611 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.8968-t42630 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.9015-t42641 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.9300-t42660 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.9347-t42671 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.9632-t42690 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.9679-t42701 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.9964-t42720 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.10011-t42731 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.10296-t42750 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.10343-t42761 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.10628-t42780 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.10675-t42791 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.10960-t42810 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.11007-t42821 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.11292-t42840 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.11339-t42851 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.11624-t42870 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.11671-t42881 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.11956-t42900 +2025-08-07T13:56:26Z WARNING 49811 [post_scheduler]: Inserted memset 0 for _dot.12003-t42911 +2025-08-07T13:56:38Z INFO 49811 [post_scheduler]: Time-aware hwm post-sched +2025-08-07T13:56:44Z INFO 49811 [post_scheduler]: Time-aware simulation time: 35002624 +2025-08-07T13:56:45Z INFO 49811 [post_scheduler]: Done PosT ScheD Thu Aug 7 13:56:45 2025 +2025-08-07T13:56:45Z USER 49811 [ModuleForkPass]: post_sched finished after 19.119 seconds +2025-08-07T13:56:45Z INFO 49811 [ModuleForkPass]: curr_vmrss: 2278mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:45Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277995 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:45Z USER 49811 [ModuleForkPass]: Running expand_scheduling_units +2025-08-07T13:56:45Z INFO 49811 [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=28105 blocks=1 instructions=277995 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:45Z USER 49811 [ModuleForkPass]: expand_scheduling_units finished after 0.031 seconds +2025-08-07T13:56:45Z INFO 49811 [ModuleForkPass]: curr_vmrss: 2158mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:45Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277995 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:45Z USER 49811 [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:56:45Z INFO 49811 [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=28105 blocks=1 instructions=277995 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:49Z INFO 49811 [DMAOptimizationBase]: PSUM Rotation rotated 6323 PSUM Banks +2025-08-07T13:56:49Z INFO 49811 [DMAOptimizationBase]: PSUM Rotation rotated 7208 PSUM Banks +2025-08-07T13:56:50Z INFO 49811 [DMAOptimizationBase]: PSUM Rotation rotated 236 PSUM Banks +2025-08-07T13:56:50Z INFO 49811 [DMAOptimizationBase]: SB Rotation rotated 424 Sb address +2025-08-07T13:56:50Z INFO 49811 [DMAOptimizationBase]: SB Rotation rotated 4976 Sb address +2025-08-07T13:56:51Z INFO 49811 [DMAOptimizationBase]: SB Rotation rotated 404 Sb address +2025-08-07T13:56:51Z INFO 49811 [DMAOptimizationBase]: SB Rotation rotated 209 Sb address +2025-08-07T13:56:52Z INFO 49811 [DMAOptimizationBase]: SB Rotation rotated 201 Sb address +2025-08-07T13:56:52Z INFO 49811 [DMAOptimizationBase]: moved 0 MM forward +2025-08-07T13:56:52Z INFO 49811 [DMAOptimizationBase]: SB Rotation rotated 15 Sb address +2025-08-07T13:56:52Z INFO 49811 [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-08-07T13:56:52Z USER 49811 [ModuleForkPass]: address_rotation_sb finished after 7.046 seconds +2025-08-07T13:56:52Z INFO 49811 [ModuleForkPass]: curr_vmrss: 2181mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:53Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277995 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:53Z USER 49811 [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:56:53Z INFO 49811 [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=28105 blocks=1 instructions=277995 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:53Z INFO 49811 [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:56:53Z INFO 49811 [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:56:53Z INFO 49811 [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:56:54Z USER 49811 [ModuleForkPass]: anti_dependency_analyzer finished after 1.312 seconds +2025-08-07T13:56:54Z INFO 49811 [ModuleForkPass]: curr_vmrss: 2363mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:54Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277995 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:54Z USER 49811 [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:56:54Z INFO 49811 [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=28105 blocks=1 instructions=277995 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:54Z INFO 49811 [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:56:54Z INFO 49811 [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-08-07T13:56:54Z INFO 49811 [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:56:54Z USER 49811 [ModuleForkPass]: anti_dependency_analyzer finished after 0.362 seconds +2025-08-07T13:56:54Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1977mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:54Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277995 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:54Z USER 49811 [ModuleForkPass]: Running dep_opt +2025-08-07T13:56:54Z INFO 49811 [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=28105 blocks=1 instructions=277995 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:54Z INFO 49811 [build_flow_deps]: Start build fdeps. Invocation: 3Thu Aug 7 13:56:54 2025 +2025-08-07T13:56:54Z INFO 49811 [build_flow_deps]: Allocs: 28105 instructions: 277995 +2025-08-07T13:56:55Z INFO 49811 [build_flow_deps]: Build fdeps inserted 809872 edges +2025-08-07T13:56:55Z INFO 49811 [build_flow_deps]: Done build fdeps 809872 Thu Aug 7 13:56:55 2025 +2025-08-07T13:56:55Z USER 49811 [ModuleForkPass]: dep_opt finished after 1.201 seconds +2025-08-07T13:56:55Z INFO 49811 [ModuleForkPass]: curr_vmrss: 2011mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:55Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277995 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:55Z USER 49811 [ModuleForkPass]: Running report_stats +2025-08-07T13:56:55Z INFO 49811 [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=28105 blocks=1 instructions=277995 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:55Z INFO 49811 [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 1 │ 622329856 │ +│ DMACopy │ Internal │ 1 │ 24576 │ +│ DMACopy │ Internal -> ExternalOutput │ 72 │ 75497472 │ +│ Load │ Const -> Internal │ 78 │ 2394632 │ +│ Load │ ExternalInput -> Internal │ 8268 │ 7587589280 │ +│ Load │ Internal │ 107 │ 2816264 │ +│ Save │ Internal │ 695 │ 2810116 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 4 │ +└─────────────┴────────────────────────────┴───────┴────────────┘ + +2025-08-07T13:56:55Z INFO 49811 [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 75 │ +│ 4 │ 49 │ +│ 8 │ 2 │ +│ 16 │ 3 │ +│ 64 │ 73 │ +│ 256 │ 147 │ +│ 512 │ 954 │ +│ 1024 │ 16 │ +│ 2048 │ 30 │ +│ 4096 │ 2 │ +│ 6144 │ 2304 │ +│ 8192 │ 5493 │ +│ 60768 │ 1 │ +│ 60776 │ 4 │ +│ 262144 │ 72 │ +└─────────────────────┴───────┘ + +2025-08-07T13:56:56Z INFO 49811 [ReportStats]: MM Stats: #MatMults 252168 #MatMult-Transposes 20255 +2025-08-07T13:56:56Z INFO 49811 [ReportStats]: IO Tensor size combined: 8342039576 +2025-08-07T13:56:56Z INFO 49811 [ReportStats]: IO Tensor Statistics: +┌────────────────────┬───────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼───────────────┼──────────┼──────────────┤ +│ input473 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input76 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input85 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input106 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input96 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input84 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input98 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input109 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input107 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input95 │ ExternalInput │ bfloat16 │ 50331648 │ +└────────────────────┴───────────────┴──────────┴──────────────┘ + +2025-08-07T13:56:56Z INFO 49811 [ReportStats]: Large (Internal) Tensor Statistics: +┌────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────────┼──────────┼──────────┼──────────────┤ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ input83_local_39036_i3 │ Internal │ bfloat16 │ 1048576 │ +│ -t69508 │ Internal │ float32 │ 1048576 │ +│ input83_local_39036_i2 │ Internal │ bfloat16 │ 1048576 │ +│ -t69513 │ Internal │ float32 │ 1048576 │ +│ -t69519 │ Internal │ float32 │ 1048576 │ +│ input83_local_39036_i0 │ Internal │ bfloat16 │ 1048576 │ +│ input83_local_39036_i5 │ Internal │ bfloat16 │ 1048576 │ +│ input83_local_39036_i4 │ Internal │ bfloat16 │ 1048576 │ +│ input83_local_39036_i1 │ Internal │ bfloat16 │ 1048576 │ +└────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-08-07T13:56:56Z USER 49811 [ModuleForkPass]: report_stats finished after 0.069 seconds +2025-08-07T13:56:56Z INFO 49811 [ModuleForkPass]: curr_vmrss: 2011mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:56Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277995 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:56Z USER 49811 [BackendPassManager]: mod_parallel_pass finished after 74.528 seconds +2025-08-07T13:56:56Z INFO 49811 [BackendPassManager]: curr_vmrss: 2011mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:56Z INFO 49811 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277995 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:56Z USER 49811 [BackendPassManager]: Running assign_trigger_engine +2025-08-07T13:56:56Z INFO 49811 [BackendPassManager]: Inputs to assign_trigger_engine: modules=1 functions=1 allocs=28105 blocks=1 instructions=277995 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:56Z INFO 49811 [AssignTriggerEngine]: Assigned trigger engine for 771 DMA instructions. Moved 76 DMA instructions to CC's engines. +2025-08-07T13:56:56Z USER 49811 [BackendPassManager]: assign_trigger_engine finished after 0.113 seconds +2025-08-07T13:56:56Z INFO 49811 [BackendPassManager]: curr_vmrss: 2014mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:56Z INFO 49811 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277995 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:56Z USER 49811 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:56:56Z INFO 49811 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=1 allocs=28105 blocks=1 instructions=277995 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:56Z USER 49811 [SubgraphForkPass]: Running lower_local_collectives +2025-08-07T13:56:56Z INFO 49811 [SubgraphForkPass]: Inputs to lower_local_collectives: modules=1 functions=1 allocs=28105 blocks=1 instructions=277995 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:56Z USER 49811 [SubgraphForkPass]: lower_local_collectives finished after 0.001 seconds +2025-08-07T13:56:56Z INFO 49811 [SubgraphForkPass]: curr_vmrss: 2014mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:56Z INFO 49811 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277995 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:56Z USER 49811 [SubgraphForkPass]: Running extend_shared_lifetimes +2025-08-07T13:56:56Z INFO 49811 [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=1 functions=1 allocs=28105 blocks=1 instructions=277995 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:56Z USER 49811 [SubgraphForkPass]: extend_shared_lifetimes finished after 0.001 seconds +2025-08-07T13:56:56Z INFO 49811 [SubgraphForkPass]: curr_vmrss: 2014mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:56Z INFO 49811 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277995 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:56Z USER 49811 [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:56:56Z INFO 49811 [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=28105 blocks=1 instructions=277995 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:56Z INFO 49811 [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:56:56Z USER 49811 [SubgraphForkPass]: dead_code_elim finished after 0.189 seconds +2025-08-07T13:56:56Z INFO 49811 [SubgraphForkPass]: curr_vmrss: 2014mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:56Z INFO 49811 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277995 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:56Z USER 49811 [BackendPassManager]: subgraph_parallel_pass finished after 0.199 seconds +2025-08-07T13:56:56Z INFO 49811 [BackendPassManager]: curr_vmrss: 2014mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:56Z INFO 49811 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277995 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:56Z USER 49811 [BackendPassManager]: Running assign_hwdge_engine +2025-08-07T13:56:56Z INFO 49811 [BackendPassManager]: Inputs to assign_hwdge_engine: modules=1 functions=1 allocs=28105 blocks=1 instructions=277995 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:56Z USER 49811 [BackendPassManager]: assign_hwdge_engine finished after 0.032 seconds +2025-08-07T13:56:56Z INFO 49811 [BackendPassManager]: curr_vmrss: 2014mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:56Z INFO 49811 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277995 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:56Z USER 49811 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:56:56Z INFO 49811 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=28105 blocks=1 instructions=277995 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:56Z USER 49811 [ModuleForkPass]: Running alloc_queues +2025-08-07T13:56:56Z INFO 49811 [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=28105 blocks=1 instructions=277995 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:56Z INFO 49811 [AllocQueues]: DMACopy transpose will be triggered from multiple engines +2025-08-07T13:56:56Z INFO 49811 [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 41 │ +│ qPoolIO0 │ input │ Pool │ 16 │ 1 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 110 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 95 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 671 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 5 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 8300 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-08-07T13:56:56Z USER 49811 [ModuleForkPass]: alloc_queues finished after 0.033 seconds +2025-08-07T13:56:56Z INFO 49811 [ModuleForkPass]: curr_vmrss: 2014mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:56Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277995 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:56Z USER 49811 [ModuleForkPass]: Running chain_dma_transposes +2025-08-07T13:56:56Z INFO 49811 [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=28105 blocks=1 instructions=277995 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:56Z USER 49811 [ModuleForkPass]: chain_dma_transposes finished after 0.002 seconds +2025-08-07T13:56:56Z INFO 49811 [ModuleForkPass]: curr_vmrss: 2014mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:56Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277995 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:56Z USER 49811 [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-08-07T13:56:56Z INFO 49811 [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=28105 blocks=1 instructions=277995 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:56Z USER 49811 [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.001 seconds +2025-08-07T13:56:56Z INFO 49811 [ModuleForkPass]: curr_vmrss: 2014mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:56Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277995 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:56Z USER 49811 [ModuleForkPass]: Running lower_control +2025-08-07T13:56:56Z INFO 49811 [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=28105 blocks=1 instructions=277995 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:56Z INFO 49811 [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-08-07T13:56:56Z USER 49811 [ModuleForkPass]: lower_control finished after 0.373 seconds +2025-08-07T13:56:56Z INFO 49811 [ModuleForkPass]: curr_vmrss: 2014mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:56Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277995 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:56Z USER 49811 [BackendPassManager]: mod_parallel_pass finished after 0.421 seconds +2025-08-07T13:56:56Z INFO 49811 [BackendPassManager]: curr_vmrss: 2014mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:56:56Z INFO 49811 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277995 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:56Z USER 49811 [BackendPassManager]: Running nc_parallel_pass +2025-08-07T13:56:56Z INFO 49811 [BackendPassManager]: Inputs to nc_parallel_pass: modules=1 functions=1 allocs=28105 blocks=1 instructions=277995 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:56Z USER 49811 [CoreForkPass]: Running dep_reduction +2025-08-07T13:56:56Z INFO 49811 [CoreForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=28105 blocks=1 instructions=277995 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:56:56Z INFO 49811 [DepReduction]: Start Dependency Reduction +2025-08-07T13:56:56Z INFO 49811 [DepReduction]: Processing async instrs... +2025-08-07T13:56:56Z INFO 49811 [DepReduction]: Processing secondary edges per engine... +2025-08-07T13:56:57Z INFO 49811 [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 253782 +2025-08-07T13:56:57Z INFO 49811 [DepReduction]: Processing redundant descendants, Done. Num edges removed 263109 +2025-08-07T13:56:57Z INFO 49811 [DepReduction]: Processing async instrs, Done. Num edges removed 263109 +2025-08-07T13:57:00Z INFO 49811 [DepReduction]: Num Async removed: 0 +2025-08-07T13:57:00Z INFO 49811 [DepReduction]: Finished dependency reduction: 1904928 removed, new total 38683 +2025-08-07T13:57:00Z INFO 49811 [DepReduction]: Finished Dependency Reduction +2025-08-07T13:57:00Z USER 49811 [CoreForkPass]: dep_reduction finished after 3.850 seconds +2025-08-07T13:57:00Z INFO 49811 [CoreForkPass]: curr_vmrss: 2238mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:57:00Z INFO 49811 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277995 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:00Z USER 49811 [CoreForkPass]: Running lower_dynamic_dma +2025-08-07T13:57:00Z INFO 49811 [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=1 allocs=28105 blocks=1 instructions=277995 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:00Z USER 49811 [CoreForkPass]: lower_dynamic_dma finished after 0.179 seconds +2025-08-07T13:57:00Z INFO 49811 [CoreForkPass]: curr_vmrss: 2230mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:57:00Z INFO 49811 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277995 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:00Z USER 49811 [CoreForkPass]: Running legalize_dynamic_dma +2025-08-07T13:57:00Z INFO 49811 [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=1 allocs=28105 blocks=1 instructions=277995 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:00Z INFO 49811 [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 1 DGE instructions +2025-08-07T13:57:00Z INFO 49811 [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 1 DGE instructions were scanned +2025-08-07T13:57:00Z INFO 49811 [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-08-07T13:57:00Z USER 49811 [CoreForkPass]: legalize_dynamic_dma finished after 0.125 seconds +2025-08-07T13:57:00Z INFO 49811 [CoreForkPass]: curr_vmrss: 2229mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:57:00Z INFO 49811 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277995 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:00Z USER 49811 [CoreForkPass]: Running lower_dma +2025-08-07T13:57:00Z INFO 49811 [CoreForkPass]: Inputs to lower_dma: modules=1 functions=1 allocs=28105 blocks=1 instructions=277995 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:01Z INFO 49811 [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 8154/8154 (100% DGE) + power-of-2 partition : 8155/8197 (99.4876% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 8155/8197 (99.4876% DGE) + Cast (DGE/DMA) + 128 partition : 72/72 (100% DGE) + power-of-2 partition : 72/72 (100% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 72/72 (100% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 0/9 (0% DGE) + power-of-2 partition : 0/880 (0% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/880 (0% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/0 + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/0 + CopyMode + CCE : 1 + Transpose : 0 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 1/1 (100% DGE) + vector : 72/72 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-08-07T13:57:01Z USER 49811 [CoreForkPass]: lower_dma finished after 0.231 seconds +2025-08-07T13:57:01Z INFO 49811 [CoreForkPass]: curr_vmrss: 2229mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:57:01Z INFO 49811 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277997 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:01Z USER 49811 [CoreForkPass]: Running coalesce_dma_blocks +2025-08-07T13:57:01Z INFO 49811 [CoreForkPass]: Inputs to coalesce_dma_blocks: modules=1 functions=1 allocs=28105 blocks=1 instructions=277997 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:01Z INFO 49811 [CoalesceDmaBlocks]: Coaleseced 57 DMA triggers +2025-08-07T13:57:01Z USER 49811 [CoreForkPass]: coalesce_dma_blocks finished after 0.144 seconds +2025-08-07T13:57:01Z INFO 49811 [CoreForkPass]: curr_vmrss: 2233mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:57:01Z INFO 49811 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277940 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:01Z USER 49811 [CoreForkPass]: Running expand_all_engine +2025-08-07T13:57:01Z INFO 49811 [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=1 allocs=28105 blocks=1 instructions=277940 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:01Z USER 49811 [CoreForkPass]: expand_all_engine finished after 0.047 seconds +2025-08-07T13:57:01Z INFO 49811 [CoreForkPass]: curr_vmrss: 2229mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:57:01Z INFO 49811 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277940 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:01Z USER 49811 [CoreForkPass]: Running alloc_semaphores +2025-08-07T13:57:01Z INFO 49811 [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=1 allocs=28105 blocks=1 instructions=277940 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:01Z USER 49811 [CoreForkPass]: alloc_semaphores finished after 0.385 seconds +2025-08-07T13:57:01Z INFO 49811 [CoreForkPass]: curr_vmrss: 2229mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:57:01Z INFO 49811 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277940 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:01Z USER 49811 [CoreForkPass]: Running expand_inst_late +2025-08-07T13:57:01Z INFO 49811 [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=1 allocs=28105 blocks=1 instructions=277940 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:02Z USER 49811 [CoreForkPass]: expand_inst_late finished after 0.493 seconds +2025-08-07T13:57:02Z INFO 49811 [CoreForkPass]: curr_vmrss: 2229mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:57:02Z INFO 49811 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 278015 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:02Z USER 49811 [CoreForkPass]: Running seq_inst_opt +2025-08-07T13:57:02Z INFO 49811 [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=1 allocs=28105 blocks=1 instructions=278015 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:02Z INFO 49811 [SeqInstOpt]: Removing 71 unnecessary InstRegisterMove instruction(s) from Block1 +2025-08-07T13:57:02Z USER 49811 [CoreForkPass]: seq_inst_opt finished after 0.034 seconds +2025-08-07T13:57:02Z INFO 49811 [CoreForkPass]: curr_vmrss: 2229mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:57:02Z INFO 49811 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 277944 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:02Z USER 49811 [CoreForkPass]: Running lower_sync +2025-08-07T13:57:02Z INFO 49811 [CoreForkPass]: Inputs to lower_sync: modules=1 functions=1 allocs=28105 blocks=1 instructions=277944 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:02Z USER 49811 [CoreForkPass]: lower_sync finished after 0.106 seconds +2025-08-07T13:57:02Z INFO 49811 [CoreForkPass]: curr_vmrss: 2236mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:57:02Z INFO 49811 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 286459 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:02Z USER 49811 [CoreForkPass]: Running lower_act +2025-08-07T13:57:02Z INFO 49811 [CoreForkPass]: Inputs to lower_act: modules=1 functions=1 allocs=28105 blocks=1 instructions=286459 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:02Z USER 49811 [CoreForkPass]: lower_act finished after 0.060 seconds +2025-08-07T13:57:02Z INFO 49811 [CoreForkPass]: curr_vmrss: 2237mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:57:02Z INFO 49811 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 286738 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:02Z USER 49811 [CoreForkPass]: Running lower_dve +2025-08-07T13:57:02Z INFO 49811 [CoreForkPass]: Inputs to lower_dve: modules=1 functions=1 allocs=28105 blocks=1 instructions=286738 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:02Z INFO 49811 [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen2/dve_info.json +2025-08-07T13:57:02Z USER 49811 [CoreForkPass]: lower_dve finished after 0.468 seconds +2025-08-07T13:57:02Z INFO 49811 [CoreForkPass]: curr_vmrss: 2280mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:57:02Z INFO 49811 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 286738 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:02Z USER 49811 [CoreForkPass]: Running lower_ap +2025-08-07T13:57:02Z INFO 49811 [CoreForkPass]: Inputs to lower_ap: modules=1 functions=1 allocs=28105 blocks=1 instructions=286738 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:03Z USER 49811 [CoreForkPass]: lower_ap finished after 0.113 seconds +2025-08-07T13:57:03Z INFO 49811 [CoreForkPass]: curr_vmrss: 2237mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:57:03Z INFO 49811 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 286738 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:03Z USER 49811 [CoreForkPass]: Running coloring_allocator_reg +2025-08-07T13:57:03Z INFO 49811 [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=1 allocs=28105 blocks=1 instructions=286738 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:03Z INFO 49811 [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:57:03Z INFO 49811 [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:57:03Z INFO 49811 [REG_Allocator]: allocating REG +2025-08-07T13:57:03Z INFO 49811 [REG_Allocator]: main loop iteration 1 +2025-08-07T13:57:03Z INFO 49811 [REG_Allocator]: renumber registers +2025-08-07T13:57:03Z INFO 49811 [REG_Allocator]: size = 5 +2025-08-07T13:57:03Z INFO 49811 []: find first defs for local reg +2025-08-07T13:57:03Z INFO 49811 []: find first defs for global reg +2025-08-07T13:57:03Z INFO 49811 [REG_Allocator]: live range analysis +2025-08-07T13:57:03Z INFO 49811 [REG_Allocator]: find costs +2025-08-07T13:57:03Z INFO 49811 [REG_Allocator]: simplify interference graph +2025-08-07T13:57:03Z INFO 49811 [REG_Allocator]: initialize low and high +2025-08-07T13:57:03Z INFO 49811 [REG_Allocator]: lo = 5 +2025-08-07T13:57:03Z INFO 49811 [REG_Allocator]: hi = 0 +2025-08-07T13:57:03Z INFO 49811 [REG_Allocator]: inf = 0 +2025-08-07T13:57:03Z INFO 49811 [REG_Allocator]: total = 5 +2025-08-07T13:57:03Z INFO 49811 [REG_Allocator]: simplify +2025-08-07T13:57:03Z INFO 49811 [REG_Allocator]: new candidates = 0 +2025-08-07T13:57:03Z INFO 49811 [REG_Allocator]: select ranges +2025-08-07T13:57:03Z INFO 49811 [REG_Allocator]: no more spills +2025-08-07T13:57:03Z INFO 49811 [REG_Allocator]: REG score = 0 (lower is better) +2025-08-07T13:57:03Z INFO 49811 [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-08-07T13:57:03Z INFO 49811 [REG_Allocator]: 0% REG utilization after allocation +2025-08-07T13:57:03Z USER 49811 [CoreForkPass]: coloring_allocator_reg finished after 0.561 seconds +2025-08-07T13:57:03Z INFO 49811 [CoreForkPass]: curr_vmrss: 2282mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:57:03Z INFO 49811 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 286738 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:04Z USER 49811 [BackendPassManager]: nc_parallel_pass finished after 7.249 seconds +2025-08-07T13:57:04Z INFO 49811 [BackendPassManager]: curr_vmrss: 2237mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:57:04Z INFO 49811 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 286738 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:04Z USER 49811 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:57:04Z INFO 49811 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=28105 blocks=1 instructions=286738 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:04Z USER 49811 [ModuleForkPass]: Running birverifier +2025-08-07T13:57:04Z INFO 49811 [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=28105 blocks=1 instructions=286738 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:04Z USER 49811 [ModuleForkPass]: birverifier finished after 0.263 seconds +2025-08-07T13:57:04Z INFO 49811 [ModuleForkPass]: curr_vmrss: 1984mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:57:04Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 286738 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:04Z USER 49811 [BackendPassManager]: mod_parallel_pass finished after 0.269 seconds +2025-08-07T13:57:04Z INFO 49811 [BackendPassManager]: curr_vmrss: 1984mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:57:04Z INFO 49811 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 286738 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:04Z USER 49811 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:57:04Z INFO 49811 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=1 allocs=28105 blocks=1 instructions=286738 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:04Z USER 49811 [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:57:04Z INFO 49811 [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=28105 blocks=1 instructions=286738 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:04Z USER 49811 [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-08-07T13:57:04Z INFO 49811 [SubgraphForkPass]: curr_vmrss: 1984mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:57:04Z INFO 49811 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 286738 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:04Z USER 49811 [BackendPassManager]: subgraph_parallel_pass finished after 0.004 seconds +2025-08-07T13:57:04Z INFO 49811 [BackendPassManager]: curr_vmrss: 1984mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:57:04Z INFO 49811 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 286738 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:04Z USER 49811 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:57:04Z INFO 49811 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=28105 blocks=1 instructions=286738 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:04Z USER 49811 [ModuleForkPass]: Running codegen +2025-08-07T13:57:04Z INFO 49811 [ModuleForkPass]: Inputs to codegen: modules=1 functions=1 allocs=28105 blocks=1 instructions=286738 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:04Z INFO 49811 [Codegen]: Total compiler allocated DRAM tensors: 0.00290298 GB +2025-08-07T13:57:04Z INFO 49811 [Codegen]: Total un-allocated DRAM tensors by kind: +2025-08-07T13:57:04Z INFO 49811 [Codegen]: +┌────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼─────────────┤ +│ ExternalInput │ 7.69882 │ +│ ExternalOutput │ 3.72529e-09 │ +│ Const │ 0.0022452 │ +└────────────────┴─────────────┘ + +2025-08-07T13:57:04Z INFO 49811 [Codegen]: Total runtime managed DRAM tensors: 7.70106 GB +2025-08-07T13:57:05Z INFO 49811 [Codegen]: Instruction Stats: +2025-08-07T13:57:05Z INFO 49811 [Codegen]: +┌─────────────────────┬────────┐ +│ Opcode │ Count │ +├─────────────────────┼────────┤ +│ MATMUL │ 252168 │ +│ LDWEIGHTS │ 252168 │ +│ ACTIVATE │ 12721 │ +│ EVENT_SEMAPHORE │ 8515 │ +│ UNKNOWN(0xd4) │ 8300 │ +│ TENSOR_TENSOR │ 1125 │ +│ PSEUDO_DMA_TRIGGER │ 866 │ +│ MATCH_VALUE_LOAD │ 441 │ +│ MEMSET │ 369 │ +│ TENSOR_SCALAR_ADDR │ 345 │ +│ TENSOR_SCALAR │ 332 │ +│ LOAD_MASK_SELECT │ 294 │ +│ ACT_TABLE_LOAD │ 279 │ +│ CAST │ 229 │ +│ MAX8 │ 224 │ +│ FIND_INDEX8 │ 224 │ +│ STREAM_SHUFFLE │ 222 │ +│ MATCH_REPLACE8 │ 217 │ +│ UNKNOWN(0xda) │ 148 │ +│ TENSOR_REDUCE │ 115 │ +│ GATHER │ 99 │ +│ POOL_BUFFER_LOAD │ 99 │ +│ UNKNOWN(0xd9) │ 75 │ +│ RECIPROCAL │ 75 │ +│ IOTA │ 73 │ +│ COPY │ 73 │ +│ STREAM_TRANSPOSE │ 72 │ +│ UNKNOWN(0xe8) │ 38 │ +│ UNKNOWN(0x8d) │ 36 │ +│ PSEUDO_BRANCH_LABEL │ 5 │ +│ ALU_OP │ 2 │ +│ UNKNOWN(0xe5) │ 2 │ +│ MOVE │ 1 │ +│ PSEUDO_TENSOR_LOAD │ 1 │ +│ NOP │ 1 │ +│ RNG │ 1 │ +│ TENSOR_SCALAR │ 1 │ +└─────────────────────┴────────┘ + +2025-08-07T13:57:05Z INFO 49811 [Codegen]: +┌────────────┬────────┐ +│ Engine │ Count │ +├────────────┼────────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 13367 │ +│ Scalar │ 14654 │ +│ Tensor │ 507455 │ +│ SyncDMA │ 0 │ +│ Vector │ 4320 │ +│ Sync │ 165 │ +│ All │ 0 │ +└────────────┴────────┘ + +2025-08-07T13:57:05Z INFO 49811 [Codegen]: Total instructions: 539961 (0.0321842 GB) +2025-08-07T13:57:05Z INFO 49811 [Codegen]: Total DynamicDMA instruction count: 8300 +2025-08-07T13:57:05Z USER 49811 [Codegen]: isa_gen finished after 1.223 seconds +2025-08-07T13:57:05Z INFO 49811 [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────┼────────────────┤ +│ qActSpillReload0 │ 5932 │ +│ qDVESpillReload0 │ 264 │ +│ qPoolIO0 │ 2 │ +│ qPoolSpillReload0 │ 7308 │ +│ qSPIO0 │ 84 │ +│ qSPSpillReload0 │ 12766 │ +└───────────────────┴────────────────┘ + +Total descriptors: 26356 (0.000392735 GB) +2025-08-07T13:57:05Z INFO 49811 [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qSPIO0 │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qPoolDynamic │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +│ qPoolIO0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 112 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-08-07T13:57:05Z INFO 49811 [Codegen]: Tensors with largest descriptor count: +┌─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬───────────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┼───────────────┼──────────┼──────────────────┤ +│ Coalesced_memloc_Coalesced_memloc_cosine.140.56176--cosine.140.56172_48--Coalesced_memloc_cosine.140.56165--cosine.140.56161_51_111 │ Internal │ float32 │ 5 │ +│ Coalesced_memloc_Coalesced_memloc_cosine.140.56110--cosine.140.56106_66--Coalesced_memloc_cosine.140.56099--cosine.140.56095_69_120 │ Internal │ float32 │ 5 │ +│ Coalesced_memloc_Coalesced_memloc_cosine.140.56198--cosine.140.56194_42--Coalesced_memloc_cosine.140.56187--cosine.140.56183_45_108 │ Internal │ float32 │ 5 │ +│ Coalesced_memloc_Coalesced_memloc_cosine.140.56286--cosine.140.56282_18--Coalesced_memloc_cosine.140.56275--cosine.140.56271_21_96 │ Internal │ float32 │ 5 │ +│ Coalesced_memloc_Coalesced_memloc_cosine.140.56132--cosine.140.56128_60--Coalesced_memloc_cosine.140.56121--cosine.140.56117_63_117 │ Internal │ float32 │ 5 │ +│ Coalesced_memloc_Coalesced_memloc_cosine.140.56088--cosine.140.56084_72--Coalesced_memloc_cosine.140.56077--cosine.140.56073_75_123 │ Internal │ float32 │ 5 │ +│ Coalesced_memloc_Coalesced_memloc_cosine.140.56242--cosine.140.56238_30--Coalesced_memloc_cosine.140.56231--cosine.140.56227_33_102 │ Internal │ float32 │ 5 │ +│ Coalesced_memloc_Coalesced_memloc_cosine.140.56220--cosine.140.56216_36--Coalesced_memloc_cosine.140.56209--cosine.140.56205_39_105 │ Internal │ float32 │ 5 │ +│ input2 │ ExternalInput │ int32 │ 36 │ +│ convert.840 │ Internal │ float32 │ 599 │ +└─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴───────────────┴──────────┴──────────────────┘ + +2025-08-07T13:57:05Z USER 49811 [Codegen]: dma_desc_gen finished after 0.015 seconds +2025-08-07T13:57:05Z INFO 49811 [Codegen]: Estimated peak DRAM usage: 7.73654 GB +2025-08-07T13:57:05Z INFO 49811 [Codegen]: Generating debug info +2025-08-07T13:57:06Z WARNING 49811 [Codegen]: Found 163 instructions with more than 100 dependencies. For each such instruction, skipping writing more than 100 dependencies into the built-in NEFF debug info to prevent excessive compile time and NEFF size. For those instructions, the Neuron profiler will not display the skipped dependencies. +2025-08-07T13:57:06Z USER 49811 [Codegen]: debug_info_gen finished after 0.684 seconds +2025-08-07T13:57:06Z USER 49811 [ModuleForkPass]: codegen finished after 1.974 seconds +2025-08-07T13:57:06Z INFO 49811 [ModuleForkPass]: curr_vmrss: 2217mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:57:06Z INFO 49811 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 286738 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:06Z USER 49811 [BackendPassManager]: mod_parallel_pass finished after 2.000 seconds +2025-08-07T13:57:06Z INFO 49811 [BackendPassManager]: curr_vmrss: 2023mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:57:06Z INFO 49811 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 286738 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:06Z USER 49811 [BackendPassManager]: Running neff_packager +2025-08-07T13:57:06Z INFO 49811 [BackendPassManager]: Inputs to neff_packager: modules=1 functions=1 allocs=28105 blocks=1 instructions=286738 Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:06Z WARNING 49811 [NeffFileWriter]: writeKelp missing file /local/p4clients/pkgbuild-const/workspace/build/KaenaCompiler/KaenaCompiler-2.x.169490.0/AL2_x86_64/DEV.STD.PTHREAD/build/private/_skbuild/linux-x86_64-3.10/cmake-build/neuronxcc/walrus/neff_packager/MetricMetadata.json +2025-08-07T13:57:06Z INFO 49811 [NeffFileWriter]: Neff will be written to: /home/ubuntu/qwen3/token_generation_model/_tp0_bk1/model.MODULE_d608453625db6ed38994+e5eecdd4.neff +2025-08-07T13:57:06Z INFO 49811 [NeffFileWriter]: IR signature: ba0b08f12ec019b2e21b9b19ad0c85be for neff artifacts +2025-08-07T13:57:06Z USER 49811 [BackendPassManager]: neff_packager finished after 0.349 seconds +2025-08-07T13:57:06Z INFO 49811 [BackendPassManager]: curr_vmrss: 2023mb, ru_maxrss: 2389mb (delta=0mb) +2025-08-07T13:57:06Z INFO 49811 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28105 memory location(s), 1 block(s), and 286738 instruction(s). Max writers: 1537 Max Readers: 20251 +2025-08-07T13:57:06Z INFO 49811 [BackendDriver]: HBM scratchpad usage summary (post-allocation): +┌──────┬───────────┬────────────────────────────────────────────────────────────┬─────────────┐ +│ Core │ Subgraph │ Description │ Value │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc00 │ module │ Peak scratchpad usage: local │ 0.002903 GB │ +│ nc00 │ module │ Total size of allocated tensors: local │ 0.003231 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local │ 0.002903 GB │ +│ nc00 │ Post-link │ Peak scratchpad usage after intermediate tensor allocation │ 0.000000 GB │ +│ nc00 │ Post-link │ Total size of allocated intermediate tensors │ 0.000000 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ Max │ Max │ Peak scratchpad usage │ 0.002903 GB │ +│ Max │ Max │ Peak scratchpad usage (page-aligned) │ 0.500000 GB │ +└──────┴───────────┴────────────────────────────────────────────────────────────┴─────────────┘ + +2025-08-07T13:57:06Z INFO 49811 [BackendDriver]: Backend completed successfully, tearing down. +2025-08-07T13:57:07Z INFO 48500 [job.WalrusDriver.0]: Job #0 finished +2025-08-07T13:57:07Z INFO 48500 [pipeline.Pipeline.0]: Finished job job.WalrusDriver.0 +2025-08-07T13:57:07Z INFO 48500 [pipeline.Pipeline.0]: Starting job job.BIRLinker.0 +2025-08-07T13:57:07Z INFO 48500 [job.BIRLinker.0]: Replay this job by calling: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/bin/neuronx-cc compile --framework XLA --state '{"model": ["/home/ubuntu/qwen3/token_generation_model/_tp0_bk1/model.MODULE_d608453625db6ed38994+e5eecdd4.hlo_module.pb"], "tensormap": "tensor_map.json", "bir": "bir.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "state_dir": "/home/ubuntu/qwen3/token_generation_model/_tp0_bk1/neuronxcc-hdngl0fs/sg00", "state_id": "sg00"}' --pipeline BIRLinker +2025-08-07T13:57:07Z INFO 48500 [job.BIRLinker.0]: BIRLinker cwd: /home/ubuntu/qwen3/token_generation_model/_tp0_bk1/neuronxcc-hdngl0fs +2025-08-07T13:57:07Z INFO 48500 [job.BIRLinker.0]: Linking not needed. Netlist doesnt exist +2025-08-07T13:57:07Z INFO 48500 [pipeline.Pipeline.0]: Finished job job.BIRLinker.0 +2025-08-07T13:57:07Z INFO 48500 [pipeline.Pipeline.0]: Starting job job.Kelper.0 +2025-08-07T13:57:07Z INFO 48500 [job.Kelper.0]: Skipping neff generation which was already performed by neff_packager +2025-08-07T13:57:07Z INFO 48500 [pipeline.Pipeline.0]: Finished job job.Kelper.0 +2025-08-07T13:57:07Z INFO 48500 [pipeline.Pipeline.0]: Starting job job.NeffWrapper.0 +2025-08-07T13:57:07Z INFO 48500 [job.NeffWrapper.0]: Job NeffWrapper len(in_states) 1 +2025-08-07T13:57:07Z INFO 48500 [job.NeffWrapper.0]: Processing input #0 +2025-08-07T13:57:07Z INFO 48500 [job.NeffWrapper.0]: Start NeffWrapper +2025-08-07T13:57:07Z INFO 48500 [job.NeffWrapper.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo-neff-wrapper --hlo /home/ubuntu/qwen3/token_generation_model/_tp0_bk1/model.MODULE_d608453625db6ed38994+e5eecdd4.hlo_module.pb --neff /home/ubuntu/qwen3/token_generation_model/_tp0_bk1/model.MODULE_d608453625db6ed38994+e5eecdd4.neff --io_transposes /home/ubuntu/qwen3/token_generation_model/_tp0_bk1/neuronxcc-hdngl0fs/io_transposes.json --output /home/ubuntu/qwen3/token_generation_model/_tp0_bk1/wrapped_neff.hlo --netlist /home/ubuntu/qwen3/token_generation_model/_tp0_bk1/neuronxcc-hdngl0fs/hlo_netlist.json +2025-08-07T13:57:07Z INFO 48500 [job.NeffWrapper.0]: Could not open file: /home/ubuntu/qwen3/token_generation_model/_tp0_bk1/neuronxcc-hdngl0fs/hlo_netlist.json +There are no io transposes nor zero-sized parameters. Output will not be produced. +Hlo neff wrapper finished successfully. Have a wonderful day :D + +2025-08-07T13:57:08Z INFO 48500 [job.NeffWrapper.0]: Job #0 finished +2025-08-07T13:57:08Z INFO 48500 [pipeline.Pipeline.0]: Finished job job.NeffWrapper.0 +2025-08-07T13:57:08Z INFO 48500 [pipeline.Pipeline.0]: Finished pipeline Pipeline +2025-08-07T13:57:08Z INFO 48500 [pipeline.Pipeline.0]: Job #0 finished +2025-08-07T13:57:08Z INFO 47983 [root]: Subcommand returned with exitcode=0 diff --git a/token_generation_model/_tp0_bk1/metaneff.pb b/token_generation_model/_tp0_bk1/metaneff.pb new file mode 100644 index 0000000000000000000000000000000000000000..8f053981be90d641e69c34ba4ccb8562fda0d824 --- /dev/null +++ b/token_generation_model/_tp0_bk1/metaneff.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9346ad40c693ec146408e5d89f9e62b53bbd9b28ae0cfa82a8c4313c85577c89 +size 984551 diff --git a/token_generation_model/_tp0_bk1/model.MODULE_d608453625db6ed38994+e5eecdd4.hlo_module.pb b/token_generation_model/_tp0_bk1/model.MODULE_d608453625db6ed38994+e5eecdd4.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..dfcf90de6f81bc9f851e2e882c64a2dd4ddee8d8 --- /dev/null +++ b/token_generation_model/_tp0_bk1/model.MODULE_d608453625db6ed38994+e5eecdd4.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c565989bf644de18fb8b4dbcf5ae03d0be2bfe8bc7c9308e7954d0a9db691fc +size 1063359 diff --git a/token_generation_model/_tp0_bk1/model.MODULE_d608453625db6ed38994+e5eecdd4.neff b/token_generation_model/_tp0_bk1/model.MODULE_d608453625db6ed38994+e5eecdd4.neff new file mode 100644 index 0000000000000000000000000000000000000000..382ea79cb1de59673b9eebde39a0a7d7f0a2eaa7 --- /dev/null +++ b/token_generation_model/_tp0_bk1/model.MODULE_d608453625db6ed38994+e5eecdd4.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7766ebc9549de8407cdfbe1f261eb1990584c4f800bbdd332e5825276d7e8ba9 +size 6042624 diff --git a/token_generation_model/_tp0_bk1/neuron_config.json b/token_generation_model/_tp0_bk1/neuron_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7fc79cec52df3317fbe5f62d633594e05497d326 --- /dev/null +++ b/token_generation_model/_tp0_bk1/neuron_config.json @@ -0,0 +1,220 @@ +{ + "_attn_implementation_autoset": false, + "_name_or_path": "Qwen/Qwen3-8B", + "add_cross_attention": false, + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "attribute_map": {}, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 151643, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 151645, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "fused_spec_config": null, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 12288, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 40960, + "max_window_layers": 36, + "metadata": null, + "min_length": 0, + "model_type": "qwen3", + "neuron_config": { + "activation_quantization_type": null, + "allow_input_truncation": false, + "apply_seq_ids_mask": false, + "async_mode": false, + "attention_dp_degree": 1, + "attention_dtype": null, + "attn_block_cte_nki_kernel_enabled": false, + "attn_block_tkg_nki_kernel_cache_update": false, + "attn_block_tkg_nki_kernel_enabled": false, + "attn_cls": { + "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3", + "__name__": "NeuronQwen3Attention" + }, + "attn_kernel_enabled": null, + "attn_tkg_builtin_kernel_enabled": false, + "attn_tkg_nki_kernel_enabled": false, + "batch_size": 1, + "bucket_n_active_tokens": false, + "buckets": [ + 256 + ], + "cast_type": "config", + "cc_pipeline_tiling_factor": 1, + "chunked_prefill_config": null, + "context_encoding_buckets": null, + "cp_degree": 1, + "ctx_batch_size": 1, + "disable_kv_cache_tiling": false, + "draft_model_modules_to_not_convert": null, + "enable_bucketing": true, + "enable_eagle_draft_input_norm": false, + "enable_eagle_speculation": false, + "enable_fused_speculation": false, + "enable_long_context_mode": false, + "enable_output_completion_notifications": false, + "enable_spill_reload_dge": false, + "enable_token_tree": false, + "ep_degree": 1, + "expert_mlp_nki_kernel_enabled": null, + "flash_decoding_enabled": false, + "fused_qkv": false, + "fused_rmsnorm_skip_gamma": false, + "is_block_kv_layout": null, + "is_chunked_prefill": false, + "is_continuous_batching": true, + "is_eagle_draft": false, + "is_medusa": false, + "is_prefill_stage": false, + "is_prefix_caching": false, + "k_cache_transposed": false, + "kv_cache_batch_size": 1, + "kv_cache_padding_size": 0, + "kv_cache_quant": false, + "kv_cache_tiling": false, + "layer_boundary_markers": false, + "lm_head_pad": false, + "lm_head_pad_alignment_size": 1, + "local_ranks_size": 2, + "logical_nc_config": 1, + "lora_config": null, + "max_batch_size": 1, + "max_context_length": 1024, + "max_length": 1024, + "max_new_tokens": null, + "medusa_speculation_length": 0, + "medusa_tree": null, + "mlp_kernel_enabled": false, + "mlp_kernel_fuse_residual_add": false, + "modules_to_not_convert": null, + "moe_fused_nki_kernel_enabled": null, + "n_active_tokens": 1, + "n_positions": 1024, + "num_medusa_heads": 0, + "on_cpu": false, + "on_device_sampling_config": { + "deterministic": false, + "do_sample": false, + "dynamic": true, + "global_topk": 256, + "on_device_sampling_config": true, + "temperature": 1.0, + "top_k": 1, + "top_k_kernel_enabled": false, + "top_p": 1.0 + }, + "output_logits": false, + "overrides_torch_dtype": true, + "pa_block_size": 1024, + "pa_num_blocks": 1, + "padding_side": "right", + "pp_degree": 1, + "prefix_buckets": null, + "qk_layernorm": false, + "qkv_kernel_enabled": false, + "qkv_kernel_fuse_residual_add": false, + "qkv_kernel_nbsd_layout": false, + "quantization_dtype": "int8", + "quantization_type": "per_tensor_symmetric", + "quantize_clamp_bound": Infinity, + "quantized": false, + "quantized_checkpoints_path": null, + "quantized_mlp_kernel_enabled": false, + "rmsnorm_quantize_kernel_enabled": false, + "router_topk_nki_kernel_enabled": null, + "rpl_reduce_dtype": null, + "save_sharded_checkpoint": true, + "scratchpad_page_size": null, + "seq_len": 1024, + "seq_len_threshold_for_cc_tiling": 16384, + "sequence_parallel_enabled": false, + "shared_mlp_nki_kernel_enabled": null, + "skip_sharding": false, + "skip_warmup": false, + "spec_batch_size": 1, + "speculation_length": 0, + "start_rank_id": 0, + "target": null, + "tile_cc": false, + "tkg_batch_size": 1, + "token_generation_buckets": [ + 256 + ], + "token_tree_config": null, + "torch_dtype": "bfloat16", + "tp_degree": 2, + "vocab_parallel": false, + "weight_gather_seq_len_threshold": 32768, + "weights_to_skip_layout_optimization": [], + "world_size": 2 + }, + "no_repeat_ngram_size": 0, + "num_attention_heads": 32, + "num_beam_groups": 1, + "num_beams": 1, + "num_cores_per_group": 1, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sep_token_id": null, + "sliding_window": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torchscript": false, + "transformers_version": "4.51.0", + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/token_generation_model/_tp0_bk2/command.txt b/token_generation_model/_tp0_bk2/command.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d9ae1c3569a7c6f5457dcdbb015f3375be729ab --- /dev/null +++ b/token_generation_model/_tp0_bk2/command.txt @@ -0,0 +1 @@ +neuronx-cc compile --framework=XLA model.MODULE_0ae1021f5dbf9cbac54d+2aa9c8c9.hlo_module.pb --output model.MODULE_0ae1021f5dbf9cbac54d+2aa9c8c9.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ' --lnc=1 -O2 --internal-hlo2tensorizer-options=--verify-hlo=true --logfile=log-neuron-cc.txt --verbose=35 \ No newline at end of file diff --git a/token_generation_model/_tp0_bk2/compile_flags.MODULE_0ae1021f5dbf9cbac54d+2aa9c8c9.json b/token_generation_model/_tp0_bk2/compile_flags.MODULE_0ae1021f5dbf9cbac54d+2aa9c8c9.json new file mode 100644 index 0000000000000000000000000000000000000000..d6bc2a446deaff19ef9fb2838877845d45e54b10 --- /dev/null +++ b/token_generation_model/_tp0_bk2/compile_flags.MODULE_0ae1021f5dbf9cbac54d+2aa9c8c9.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ", "--lnc=1", "-O2", "--internal-hlo2tensorizer-options=--verify-hlo=true", "--logfile=/home/ubuntu/qwen3/token_generation_model/_tp0_bk2/log-neuron-cc.txt"] \ No newline at end of file diff --git a/token_generation_model/_tp0_bk2/global_metric_store.json b/token_generation_model/_tp0_bk2/global_metric_store.json new file mode 100644 index 0000000000000000000000000000000000000000..da89eb093052a987b7eab705eaee2d0ed6c90bd5 --- /dev/null +++ b/token_generation_model/_tp0_bk2/global_metric_store.json @@ -0,0 +1,540 @@ +{ + "Average": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 99.84916687011719, + "StaticProfiler::AveragePartitionUtilization": 99.44678497314453, + "StaticProfiler::AveragePeUtilization": 99.62279510498047, + "StaticProfiler::LocalizationEfficiency": 109.57173156738281, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 109.65839385986328, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0 + } + }, + "Count": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 1, + "StaticProfiler::AveragePartitionUtilization": 1, + "StaticProfiler::AveragePeUtilization": 1, + "StaticProfiler::LocalizationEfficiency": 1, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1, + "TilingProfiler::AveragePeUtilizationAfterTiling": 1 + } + }, + "Sum": { + "compiletime": { + "AGOrderingAnalysisPass": 2.1663870811462402, + "AffinePredicateResolution": 0.05438804626464844, + "AliasDependencyElimination": 0.002911806106567383, + "AliasDependencyInduction": 0.4430849552154541, + "AliasDependencyReset": 0.47305941581726074, + "BFComputeCutting": 0.055460214614868164, + "BirCodeGenLoop": 2.5789601802825928, + "CCOpFusion": 0.48283958435058594, + "CanonicalizeConv": 0.00012599999899975955, + "CanonicalizeDAGForPGTiling": 0.22054219245910645, + "CanonicalizeForTensorizer": 0.00044800000614486635, + "CanonicalizeIR": 0.07637262344360352, + "Canonicalizer": 0.007962999865412712, + "CoalesceCCOp": 0.2020435333251953, + "CommuteConcat": 0.03656482696533203, + "DMALocalityOpt": 0.036940574645996094, + "DMAProfiler": 0.09294247627258301, + "DMATilingProfiler": 0.07596397399902344, + "DataLocalityOpt": 2.0216355323791504, + "DataStreaming": 0.16262364387512207, + "DeConcat": 0.01939988136291504, + "DeadCodeElimination": 0.039392709732055664, + "DeadStoreElimination": 0.4171600341796875, + "DelinearIndices": 0.34912729263305664, + "Delinearization": 0.21215415000915527, + "DoNothing": 0.00018453598022460938, + "DramToDramTranspose": 1.0972154140472412, + "DumpGraphAndMetadata": 0.2566659450531006, + "EliminateDivs": 0.1831812858581543, + "ExpandBatchNorm": 0.06547975540161133, + "ExpandISAMacro": 0.09643840789794922, + "FactorizeBlkDims": 0.2866671085357666, + "FactorizeThreadAxesInFreeDims": 0.04702591896057129, + "FlattenMacroLoop": 0.08088397979736328, + "GenericAccessSimplifier": 0.03579258918762207, + "HoistCompute": 0.00012099999730708078, + "IdentifyCrossPassTensors": 0.00019500000053085387, + "InferInitValue": 1.2996962070465088, + "InferIntrinsicOnCC": 0.4359018802642822, + "InferNeuronTensor": 1.7843599319458008, + "InferNonlocalTensors": 4.240030765533447, + "InferPSumTensor": 1.1774413585662842, + "InlineNativeKernels": 0.0577547550201416, + "InsertIOTransposes": 1.0201120376586914, + "InsertLocalTransposes": 0.7727999687194824, + "InsertOffloadedTransposes": 0.09654521942138672, + "LICM": 0.11560368537902832, + "LateLegalizeInst": 0.33008742332458496, + "LateLegalizePostSplit": 0.10438108444213867, + "LateLowerReshapeOp": 0.04558396339416504, + "LateLowerTensorOp": 0.3634481430053711, + "LateNeuronInstComb": 0.5026288032531738, + "LayoutPreprocessing": 1.0427958965301514, + "LayoutPreprocessingAndAnalysis": 1.3883216381072998, + "LayoutRequirementAnalysis": 0.3317854404449463, + "LegalizeCCOpLayout": 0.08396220207214355, + "LegalizeOpLevelAlias": 0.038962364196777344, + "LegalizePartitionReduce": 0.08165168762207031, + "LegalizeSundaAccess": 1.4170327186584473, + "LegalizeSundaMacro": 0.4174661636352539, + "LegalizeType": 0.2041163444519043, + "LocalLayoutOpt": 0.403641939163208, + "LoopFusion": 0.3361647129058838, + "LoopSplitting": 0.01665043830871582, + "LowerBroadcast": 0.05313301086425781, + "LowerCCOpBlockAxis": 0.23895764350891113, + "LowerComplexBroadcast": 0.16195201873779297, + "LowerIntrinsics": 1.5076916217803955, + "LowerTensorOp": 0.5104155540466309, + "LowerTranspose": 0.39389753341674805, + "MacroGeneration": 2.381452798843384, + "MaskPropagation": 0.14420676231384277, + "MemcastMotion": 0.00020500000391621143, + "MemcpyElimination": 4.919696807861328, + "MutateDataType": 0.04858231544494629, + "NeuronAliasDependencyInduction": 0.026386737823486328, + "NeuronAliasDependencyReset": 0.03443121910095215, + "NeuronInstComb": 0.22007107734680176, + "NeuronLICM": 0.29350805282592773, + "NeuronLoopFusion": 0.5168290138244629, + "NeuronLoopInterchange": 0.05083131790161133, + "NeuronSimplifier": 0.35160350799560547, + "NeuronSimplifyPredicates": 0.1908702850341797, + "NeuronValueNumbering": 0.1089332103729248, + "OptimizeAliasedCopyChain": 0.01871037483215332, + "OptimizeNKIKernels": 0.40648579597473145, + "PAGLayoutOpt": 29.425771713256836, + "PComputeCutting": 0.31726980209350586, + "PGLayoutTilingPipeline": 43.33420944213867, + "PGTiling": 5.205065727233887, + "PadElimination": 0.008579492568969727, + "ParAxesAnnotation": 28.64370346069336, + "PartialLoopFusion": 0.34680724143981934, + "PartialSimdFusion": 0.3020198345184326, + "PenguinizeFunctions": 0.00023999999393709004, + "PerfectLoopNest": 0.06528711318969727, + "PruneFunctions": 0.0006179999909363687, + "RecognizeOpIdiom": 0.20986127853393555, + "Recompute": 0.007412910461425781, + "RelaxPredicates": 0.16043448448181152, + "Rematerialization": 0.16845178604125977, + "RemoveOptimizationBarriers": 0.00029799999902024865, + "ReshapeWeights": 0.02247476577758789, + "ResolveAccessConflict": 0.2557854652404785, + "ResolveComplicatePredicates": 0.05318570137023926, + "RewriteReplicationMatmul": 0.047820329666137695, + "RewriteWeights": 0.0624082088470459, + "SFKVectorizer": 4.161271095275879, + "ScatterMotion": 0.005237000063061714, + "SimpleAllReduceTiling": 0.06981420516967773, + "Simplifier": 0.12597966194152832, + "SimplifyMacroPredicates": 0.19452714920043945, + "SimplifyNeuronTensor": 1.3746368885040283, + "SimplifySlice": 0.03652215003967285, + "SimplifyTensor": 0.23181509971618652, + "SpillPSum": 0.38764262199401855, + "SplitAPUnionSets": 0.36852145195007324, + "SplitAccGrp": 0.04212141036987305, + "StaticProfiler": 0.1433553695678711, + "StaticTransposeLocalTensor": 0.24139881134033203, + "SundaISel": 1.6638131141662598, + "TCTransform": 0.03867149353027344, + "TensorInitialization": 0.1416149139404297, + "TensorOpSimplifier": 0.46573352813720703, + "TensorOpTransform": 1.5342040061950684, + "TensorizerLegalizationPass": 0.00018200000340584666, + "TileCCOps": 0.22609496116638184, + "TilingProfiler": 0.3983957767486572, + "TransformConvOp": 0.07304072380065918, + "TritiumFusion": 1.2241952419281006, + "ValueNumbering": 0.10742712020874023, + "VectorizeDMA": 0.03832292556762695, + "VectorizeMatMult": 0.02480769157409668, + "VerifySupportedOps": 0.0002899999963119626, + "WeightCoalescing": 0.06044602394104004, + "ZeroSizeTensorElimination": 0.0016129016876220703, + "algsimp": 0.002188999904319644, + "batchnorm_expander": 0.001088000019080937, + "boundary-marker-removal": 0.0006409999914467335, + "call-inliner": 0.0003389999910723418, + "canonicalize-boundary-marker": 0.0006350000039674342, + "collective-stream-id-checker": 8.399999933317304e-05, + "comparison-expander": 0.0005649999948218465, + "computation-deduplicator": 0.000691999972332269, + "conditional-to-select": 0.00011000000085914508, + "config-lowering": 0.0003009999927598983, + "constant_folding": 0.00023999999393709004, + "cse": 0.0006019999855197966, + "dce": 5.700000110664405e-05, + "dynamic-slice-transpose": 0.00022600000374950469, + "eliminate-redundant-compare": 0.0002589999930933118, + "emit-offloaded-dropout": 0.0003549999964889139, + "flatten-call-graph": 0.0004839999892283231, + "fuse-send-recv": 0.002319999970495701, + "hilo::LegalizeAlias": 0.003916999790817499, + "hilo::NeuronInstCombine": 0.0012929999502375722, + "hilo::NeuronOpFusion": 0.00032900000223889947, + "hilo::ReplaceTokenTypeWithU8Pass": 0.000311999989207834, + "hilo::ScheduleFusion": 4.600000102072954e-05, + "hilo::SixtyFourHack": 0.0003600000054575503, + "hilo::VerifyAliasing": 8.70000003487803e-05, + "hlo-mac-count": 0.000826000003144145, + "hlo-verifier": 0.008930999785661697, + "io-con-pipe-begin": 9.999999747378752e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.001310999970883131, + "legalize-ccops": 2.499999936844688e-05, + "legalize-compare": 0.0004949999856762588, + "lower-argminmax-custom-call": 0.00020799999765586108, + "map-inline": 0.0009640000062063336, + "metadata-naming": 0.0018500000005587935, + "mlir::detail::OpToOpPassAdaptor": 0.0002519999979995191, + "mlir::hlo::MhloToPyPenguin": 0.04394499957561493, + "mlir::mhlo::LowerComplexExtraPass": 0.003198999911546707, + "mlir::mhlo::LowerComplexPass": 0.002833999926224351, + "native-to-custom-softmax": 0.0005339999916031957, + "native-to-custom-softmax-dx": 0.0005579999997280538, + "operand_upcaster": 0.0009950000094249845, + "post-par-pipe-begin": 1.9999999949504854e-06, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0675479993224144, + "pre-hlo-begin": 7.000000096013537e-06, + "pre-hlo-end": 1.9999999949504854e-06, + "replace-minimum-constant": 0.0002770000137388706, + "reshape-mover": 9.100000170292333e-05, + "simplify-concat": 0.0023449999280273914, + "simplify-while-loops": 9.699999645818025e-05, + "transform-variadic-reduce": 0.00101200002245605, + "tuple-simplifier": 0.0002390000008745119, + "unpack-nested-aws-ntwsr": 0.0005499999970197678, + "unroll-while-loop": 1.8000000636675395e-05 + }, + "hilo": { + "HloMacCount": 3859619840.0, + "Traffic": 8267155968.0 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 264517, + "StaticProfiler::AifUb": 10.796680450439453, + "StaticProfiler::ArithmeticIntensityTensorizer": 11.830109596252441, + "StaticProfiler::AverageDmaLength": 6520.51171875, + "StaticProfiler::DDRTransferBytes": 7615514968, + "StaticProfiler::InternalTransferBytes": 647367700, + "StaticProfiler::LoadExpanded": 1061059, + "StaticProfiler::StoreExpanded": 3422, + "StaticProfiler::TotalDMAExpanded": 1064481, + "StaticProfiler::TotalDynamicInstancesCount": 277427, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 276654, + "StaticProfiler::TotalLNCComm": 0, + "StaticProfiler::TotalLNCCommTransfer": 0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0, + "TilingProfiler::DmaInstructionsAfterTiling": 0, + "TilingProfiler::GenericInstructionsAfterTiling": 79, + "TilingProfiler::MatMultInstructionsAfterTiling": 232272, + "TilingProfiler::NumPfTransposes": 327, + "TilingProfiler::NumPfTransposesForIo": 38, + "TilingProfiler::NumPfTransposesForLocal": 144, + "TilingProfiler::NumPfTransposesForNonlocal": 145, + "TilingProfiler::PfTransposeInstructions": 19982, + "TilingProfiler::PfTransposeInstructionsForIo": 19585, + "TilingProfiler::PfTransposeInstructionsForLocal": 252, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 145, + "TilingProfiler::ReduceInstructionsAfterTiling": 182, + "TilingProfiler::SimdInstructionsAfterTiling": 3323, + "TilingProfiler::TotalInstructionsAfterTiling": 0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0, + "TransformConvOp::conv2d_column_packing": 0, + "TransformConvOp::conv2d_column_packing_1": 0, + "TransformConvOp::conv2d_column_packing_io10": 0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0 + } + }, + "all": { + "compiletime": { + "CanonicalizeConv": 0.00012599999899975955, + "CanonicalizeForTensorizer": 0.00044800000614486635, + "Canonicalizer": 0.007962999865412712, + "HoistCompute": 0.00012099999730708078, + "IdentifyCrossPassTensors": 0.00019500000053085387, + "MemcastMotion": 0.00020500000391621143, + "PenguinizeFunctions": 0.00023999999393709004, + "PruneFunctions": 0.0006179999909363687, + "RemoveOptimizationBarriers": 0.00029799999902024865, + "ScatterMotion": 0.005237000063061714, + "TensorizerLegalizationPass": 0.00018200000340584666, + "VerifySupportedOps": 0.0002899999963119626, + "algsimp": 0.002188999904319644, + "batchnorm_expander": 0.001088000019080937, + "boundary-marker-removal": 0.0006409999914467335, + "call-inliner": 0.0003389999910723418, + "canonicalize-boundary-marker": 0.0006350000039674342, + "collective-stream-id-checker": 8.399999933317304e-05, + "comparison-expander": 0.0005649999948218465, + "computation-deduplicator": 0.000691999972332269, + "conditional-to-select": 0.00011000000085914508, + "config-lowering": 0.0003009999927598983, + "constant_folding": 0.00023999999393709004, + "cse": 0.0006019999855197966, + "dce": 5.700000110664405e-05, + "dynamic-slice-transpose": 0.00022600000374950469, + "eliminate-redundant-compare": 0.0002589999930933118, + "emit-offloaded-dropout": 0.0003549999964889139, + "flatten-call-graph": 0.0004839999892283231, + "fuse-send-recv": 0.002319999970495701, + "hilo::LegalizeAlias": 0.003916999790817499, + "hilo::NeuronInstCombine": 0.0012929999502375722, + "hilo::NeuronOpFusion": 0.00032900000223889947, + "hilo::ReplaceTokenTypeWithU8Pass": 0.000311999989207834, + "hilo::ScheduleFusion": 4.600000102072954e-05, + "hilo::SixtyFourHack": 0.0003600000054575503, + "hilo::VerifyAliasing": 8.70000003487803e-05, + "hlo-mac-count": 0.000826000003144145, + "hlo-verifier": 0.008930999785661697, + "io-con-pipe-begin": 9.999999747378752e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.001310999970883131, + "legalize-ccops": 2.499999936844688e-05, + "legalize-compare": 0.0004949999856762588, + "lower-argminmax-custom-call": 0.00020799999765586108, + "map-inline": 0.0009640000062063336, + "metadata-naming": 0.0018500000005587935, + "mlir::detail::OpToOpPassAdaptor": 0.0002519999979995191, + "mlir::hlo::MhloToPyPenguin": 0.04394499957561493, + "mlir::mhlo::LowerComplexExtraPass": 0.003198999911546707, + "mlir::mhlo::LowerComplexPass": 0.002833999926224351, + "native-to-custom-softmax": 0.0005339999916031957, + "native-to-custom-softmax-dx": 0.0005579999997280538, + "operand_upcaster": 0.0009950000094249845, + "post-par-pipe-begin": 1.9999999949504854e-06, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0675479993224144, + "pre-hlo-begin": 7.000000096013537e-06, + "pre-hlo-end": 1.9999999949504854e-06, + "replace-minimum-constant": 0.0002770000137388706, + "reshape-mover": 9.100000170292333e-05, + "simplify-concat": 0.0023449999280273914, + "simplify-while-loops": 9.699999645818025e-05, + "transform-variadic-reduce": 0.00101200002245605, + "tuple-simplifier": 0.0002390000008745119, + "unpack-nested-aws-ntwsr": 0.0005499999970197678, + "unroll-while-loop": 1.8000000636675395e-05 + } + }, + "cumsum": { + "compiletime": { + "CoalesceCCOp": 0.0001995563507080078, + "DMALocalityOpt": 0.00015997886657714844, + "DMAProfiler": 0.0007915496826171875, + "DataStreaming": 0.0002789497375488281, + "DoNothing": 0.00011467933654785156, + "ExpandISAMacro": 0.0005340576171875, + "FactorizeBlkDims": 0.00045871734619140625, + "InferPSumTensor": 0.00047516822814941406, + "LateLegalizeInst": 0.000408172607421875, + "LateNeuronInstComb": 0.0004611015319824219, + "LegalizeSundaAccess": 0.001535654067993164, + "LegalizeType": 0.00022554397583007813, + "LowerBroadcast": 0.00020837783813476563, + "LowerIntrinsics": 0.0002033710479736328, + "LowerTranspose": 0.0002181529998779297, + "NeuronInstComb": 0.0004856586456298828, + "NeuronLICM": 0.00040912628173828125, + "NeuronSimplifyPredicates": 0.002805471420288086, + "NeuronValueNumbering": 0.0004024505615234375, + "SFKVectorizer": 0.0027663707733154297, + "SimpleAllReduceTiling": 0.00021505355834960938, + "SimplifyNeuronTensor": 0.0004303455352783203, + "SpillPSum": 0.0004892349243164063, + "WeightCoalescing": 0.0002009868621826172 + } + }, + "sg00": { + "hilo": { + "ArithmeticIntensity": 0.9337236881256104, + "HloMacCount": 3859619840.0, + "Traffic": 8267155968.0 + } + }, + "sg0000": { + "compiletime": { + "AGOrderingAnalysisPass": 2.1663870811462402, + "AffinePredicateResolution": 0.05438804626464844, + "AliasDependencyElimination": 0.002911806106567383, + "AliasDependencyInduction": 0.4430849552154541, + "AliasDependencyReset": 0.47305941581726074, + "BFComputeCutting": 0.055460214614868164, + "BirCodeGenLoop": 2.5789601802825928, + "CCOpFusion": 0.48283958435058594, + "CanonicalizeDAGForPGTiling": 0.22054219245910645, + "CanonicalizeIR": 0.07637262344360352, + "CoalesceCCOp": 0.2018439769744873, + "CommuteConcat": 0.03656482696533203, + "DMALocalityOpt": 0.036780595779418945, + "DMAProfiler": 0.09215092658996582, + "DMATilingProfiler": 0.07596397399902344, + "DataLocalityOpt": 2.0216355323791504, + "DataStreaming": 0.16234469413757324, + "DeConcat": 0.01939988136291504, + "DeadCodeElimination": 0.039392709732055664, + "DeadStoreElimination": 0.4171600341796875, + "DelinearIndices": 0.34912729263305664, + "Delinearization": 0.21215415000915527, + "DoNothing": 6.985664367675781e-05, + "DramToDramTranspose": 1.0972154140472412, + "DumpGraphAndMetadata": 0.2566659450531006, + "EliminateDivs": 0.1831812858581543, + "ExpandBatchNorm": 0.06547975540161133, + "ExpandISAMacro": 0.09590435028076172, + "FactorizeBlkDims": 0.2862083911895752, + "FactorizeThreadAxesInFreeDims": 0.04702591896057129, + "FlattenMacroLoop": 0.08088397979736328, + "GenericAccessSimplifier": 0.03579258918762207, + "InferInitValue": 1.2996962070465088, + "InferIntrinsicOnCC": 0.4359018802642822, + "InferNeuronTensor": 1.7843599319458008, + "InferNonlocalTensors": 4.240030765533447, + "InferPSumTensor": 1.1769661903381348, + "InlineNativeKernels": 0.0577547550201416, + "InsertIOTransposes": 1.0201120376586914, + "InsertLocalTransposes": 0.7727999687194824, + "InsertOffloadedTransposes": 0.09654521942138672, + "LICM": 0.11560368537902832, + "LateLegalizeInst": 0.3296792507171631, + "LateLegalizePostSplit": 0.10438108444213867, + "LateLowerReshapeOp": 0.04558396339416504, + "LateLowerTensorOp": 0.3634481430053711, + "LateNeuronInstComb": 0.5021677017211914, + "LayoutPreprocessing": 1.0427958965301514, + "LayoutPreprocessingAndAnalysis": 1.3883216381072998, + "LayoutRequirementAnalysis": 0.3317854404449463, + "LegalizeCCOpLayout": 0.08396220207214355, + "LegalizeOpLevelAlias": 0.038962364196777344, + "LegalizePartitionReduce": 0.08165168762207031, + "LegalizeSundaAccess": 1.415497064590454, + "LegalizeSundaMacro": 0.4174661636352539, + "LegalizeType": 0.20389080047607422, + "LocalLayoutOpt": 0.403641939163208, + "LoopFusion": 0.3361647129058838, + "LoopSplitting": 0.01665043830871582, + "LowerBroadcast": 0.05292463302612305, + "LowerCCOpBlockAxis": 0.23895764350891113, + "LowerComplexBroadcast": 0.16195201873779297, + "LowerIntrinsics": 1.5074882507324219, + "LowerTensorOp": 0.5104155540466309, + "LowerTranspose": 0.3936793804168701, + "MacroGeneration": 2.381452798843384, + "MaskPropagation": 0.14420676231384277, + "MemcpyElimination": 4.919696807861328, + "MutateDataType": 0.04858231544494629, + "NeuronAliasDependencyInduction": 0.026386737823486328, + "NeuronAliasDependencyReset": 0.03443121910095215, + "NeuronInstComb": 0.21958541870117188, + "NeuronLICM": 0.29309892654418945, + "NeuronLoopFusion": 0.5168290138244629, + "NeuronLoopInterchange": 0.05083131790161133, + "NeuronSimplifier": 0.35160350799560547, + "NeuronSimplifyPredicates": 0.1880648136138916, + "NeuronValueNumbering": 0.10853075981140137, + "OptimizeAliasedCopyChain": 0.01871037483215332, + "OptimizeNKIKernels": 0.40648579597473145, + "PAGLayoutOpt": 29.425771713256836, + "PComputeCutting": 0.31726980209350586, + "PGLayoutTilingPipeline": 43.33420944213867, + "PGTiling": 5.205065727233887, + "PadElimination": 0.008579492568969727, + "ParAxesAnnotation": 28.64370346069336, + "PartialLoopFusion": 0.34680724143981934, + "PartialSimdFusion": 0.3020198345184326, + "PerfectLoopNest": 0.06528711318969727, + "RecognizeOpIdiom": 0.20986127853393555, + "Recompute": 0.007412910461425781, + "RelaxPredicates": 0.16043448448181152, + "Rematerialization": 0.16845178604125977, + "ReshapeWeights": 0.02247476577758789, + "ResolveAccessConflict": 0.2557854652404785, + "ResolveComplicatePredicates": 0.05318570137023926, + "RewriteReplicationMatmul": 0.047820329666137695, + "RewriteWeights": 0.0624082088470459, + "SFKVectorizer": 4.158504962921143, + "SimpleAllReduceTiling": 0.06959915161132813, + "Simplifier": 0.12597966194152832, + "SimplifyMacroPredicates": 0.19452714920043945, + "SimplifyNeuronTensor": 1.37420654296875, + "SimplifySlice": 0.03652215003967285, + "SimplifyTensor": 0.23181509971618652, + "SpillPSum": 0.38715338706970215, + "SplitAPUnionSets": 0.36852145195007324, + "SplitAccGrp": 0.04212141036987305, + "StaticProfiler": 0.1433553695678711, + "StaticTransposeLocalTensor": 0.24139881134033203, + "SundaISel": 1.6638131141662598, + "TCTransform": 0.03867149353027344, + "TensorInitialization": 0.1416149139404297, + "TensorOpSimplifier": 0.46573352813720703, + "TensorOpTransform": 1.5342040061950684, + "TileCCOps": 0.22609496116638184, + "TilingProfiler": 0.3983957767486572, + "TransformConvOp": 0.07304072380065918, + "TritiumFusion": 1.2241952419281006, + "ValueNumbering": 0.10742712020874023, + "VectorizeDMA": 0.03832292556762695, + "VectorizeMatMult": 0.02480769157409668, + "WeightCoalescing": 0.06024503707885742, + "ZeroSizeTensorElimination": 0.0016129016876220703 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 264517, + "StaticProfiler::AifUb": 10.796680450439453, + "StaticProfiler::ArithmeticIntensityTensorizer": 11.830109596252441, + "StaticProfiler::AverageDmaLength": 6520.51171875, + "StaticProfiler::AverageFractalPeUtilization": 99.84916687011719, + "StaticProfiler::AveragePartitionUtilization": 99.44678497314453, + "StaticProfiler::AveragePeUtilization": 99.62279510498047, + "StaticProfiler::DDRTransferBytes": 7615514968, + "StaticProfiler::InternalTransferBytes": 647367700, + "StaticProfiler::LoadExpanded": 1061059, + "StaticProfiler::LocalizationEfficiency": 109.57173156738281, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 109.65839385986328, + "StaticProfiler::StoreExpanded": 3422, + "StaticProfiler::TotalDMAExpanded": 1064481, + "StaticProfiler::TotalDynamicInstancesCount": 277427, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 276654, + "StaticProfiler::TotalLNCComm": 0, + "StaticProfiler::TotalLNCCommTransfer": 0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0, + "TilingProfiler::DmaInstructionsAfterTiling": 0, + "TilingProfiler::GenericInstructionsAfterTiling": 79, + "TilingProfiler::MatMultInstructionsAfterTiling": 232272, + "TilingProfiler::NumPfTransposes": 327, + "TilingProfiler::NumPfTransposesForIo": 38, + "TilingProfiler::NumPfTransposesForLocal": 144, + "TilingProfiler::NumPfTransposesForNonlocal": 145, + "TilingProfiler::PfTransposeInstructions": 19982, + "TilingProfiler::PfTransposeInstructionsForIo": 19585, + "TilingProfiler::PfTransposeInstructionsForLocal": 252, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 145, + "TilingProfiler::ReduceInstructionsAfterTiling": 182, + "TilingProfiler::SimdInstructionsAfterTiling": 3323, + "TilingProfiler::TotalInstructionsAfterTiling": 0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0, + "TransformConvOp::conv2d_column_packing": 0, + "TransformConvOp::conv2d_column_packing_1": 0, + "TransformConvOp::conv2d_column_packing_io10": 0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0 + } + } +} \ No newline at end of file diff --git a/token_generation_model/_tp0_bk2/graph.neff b/token_generation_model/_tp0_bk2/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..65f817e2c21386e47706553c4e4d419737c40300 --- /dev/null +++ b/token_generation_model/_tp0_bk2/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e039781a4148e1dd0618aaeaa59cdbeeede45475e212500ec37542014ecafb73 +size 6083584 diff --git a/token_generation_model/_tp0_bk2/log-neuron-cc.txt b/token_generation_model/_tp0_bk2/log-neuron-cc.txt new file mode 100644 index 0000000000000000000000000000000000000000..eadb04ef74cedd5499266fd58e0971525d29f6b4 --- /dev/null +++ b/token_generation_model/_tp0_bk2/log-neuron-cc.txt @@ -0,0 +1,2931 @@ +2025-08-07T13:53:51Z INFO 47984 [root]: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/bin/neuronx-cc compile --framework=XLA /home/ubuntu/qwen3/token_generation_model/_tp0_bk2/model.MODULE_0ae1021f5dbf9cbac54d+2aa9c8c9.hlo_module.pb --output /home/ubuntu/qwen3/token_generation_model/_tp0_bk2/model.MODULE_0ae1021f5dbf9cbac54d+2aa9c8c9.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma' --lnc=1 -O2 --internal-hlo2tensorizer-options=--verify-hlo=true --logfile=/home/ubuntu/qwen3/token_generation_model/_tp0_bk2/log-neuron-cc.txt --verbose=35 +2025-08-07T13:53:51Z INFO 47984 [root]: NeuronX Compiler version 2.20.9961.0+0acef03a Python version 3.10.12 HWM version 2.20.0.9961+0acef03a NumPy version 1.26.4 Running on AMI ami-040348201d80b58ad Running in region usw2-az4 +2025-08-07T13:53:51Z INFO 48510 [root]: XLA detected +2025-08-07T13:53:51Z INFO 48510 [root]: Pipeline: HLOToTensorizer Frontend StaticIOTranspose WalrusDriver BIRLinker Kelper NeffWrapper +2025-08-07T13:53:51Z INFO 48510 [root]: Intermediate files stored in /home/ubuntu/qwen3/token_generation_model/_tp0_bk2/neuronxcc-gh4n9bnf, output in /home/ubuntu/qwen3/token_generation_model/_tp0_bk2 +2025-08-07T13:53:51Z INFO 48510 [pipeline.Pipeline.0]: Job Pipeline len(in_states) 1 +2025-08-07T13:53:51Z INFO 48510 [pipeline.Pipeline.0]: Processing input #0 +2025-08-07T13:53:51Z INFO 48510 [pipeline.Pipeline.0]: Running pipeline Pipeline.0 +2025-08-07T13:53:51Z INFO 48510 [pipeline.Pipeline.0]: Starting job job.HLOToTensorizer.0 +2025-08-07T13:53:51Z INFO 48510 [job.HLOToTensorizer.0]: Job HLOToTensorizer len(in_states) 1 +2025-08-07T13:53:51Z INFO 48510 [job.HLOToTensorizer.0]: Processing input #0 +2025-08-07T13:53:51Z INFO 48510 [job.HLOToTensorizer.0]: IR signature: 6c544d0e20f56d5383d37218086cb993108067db2d992950c04a8fb8d9b4a59b for model.MODULE_0ae1021f5dbf9cbac54d+2aa9c8c9.hlo_module.pb +2025-08-07T13:53:51Z INFO 48510 [job.HLOToTensorizer.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo2penguin --input /home/ubuntu/qwen3/token_generation_model/_tp0_bk2/model.MODULE_0ae1021f5dbf9cbac54d+2aa9c8c9.hlo_module.pb --out-dir ./ --output penguin.py --remat --max-costly-ops=2 --max-live-in-size=5 --max-remat-chain-size=10 --max-mem-multiple=1.8 --min-def-use-distance=500 --remat-policy=transformer --allow-same-pass-remat=true --layers-per-module=1 --emit-tensor-level-dropout-ops --verify-hlo=true --native-to-custom-softmax --partitioner-opts='--transformer' +2025-08-07T13:53:52Z INFO 48510 [job.HLOToTensorizer.0]: DEBUG: needsModular? No. macCnt 3859693632 num non-trivial Ops 3786 +INFO: Switching to single-module compile. PrePartitionPipe skipped. +INFO: Found memory bound graph +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 2 +Replaced 0 dropout sequences with OffloadedDropout +INFO: HloMacCount has found 3859619840 +INFO: Traffic has found 8267156077 +INFO: AIF 0.933724 +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert cosine custom-call divide dot exponential gather get-tuple-element iota maximum multiply negate pad parameter reduce reshape rng scatter select sine slice subtract transpose tuple +Warning: Could not open file debug_info_hlo_partitions.json +2025-08-07 13:53:51.924276: W hilo/hlo2penguin/utils/DumpDebugInfo.cc:52] Truncating long HLO operator name %tuple.13231 = tuple(%reshape.5201, %scatter.12235, %scatter.12250, %scatter.12263, %scatter.12278, %scatter.12291, %scatter.12306, %scatter.12319, %scatter.12334, %scatter.12347, %scatter.12362, %scatter.12375, %scatter.12390, %scatter.12403, %scatter.12418, %scatter.12431, %scatter.12446, %scatter.12459, %scatter.12474, %scatter.12487, %scatter.12502, %scatter.12515, %scatter.12530, %scatter.12543, %scatter.12558, %scatter.12571, %scatter.12586, %scatter.12599, %scatter.12614, %scatter.12627, %scatter.... to 512 characters in the compiler's debug metadata +Invoking RemoveOptimizationBarriers pass + +2025-08-07T13:53:52Z INFO 48510 [job.HLOToTensorizer.0]: IR signature: cc465aff7091a9cd65b2bc8cc785de0be406f71be5f66c255022f78bfdb6e607 for sg0000/HLOToTensorizer +2025-08-07T13:53:52Z INFO 48510 [job.HLOToTensorizer.0]: Job #0 finished +2025-08-07T13:53:52Z INFO 48510 [pipeline.Pipeline.0]: Finished job job.HLOToTensorizer.0 +2025-08-07T13:53:52Z INFO 48510 [pipeline.Pipeline.0]: Starting job job.Frontend.0 +2025-08-07T13:53:52Z INFO 48510 [job.Frontend.0]: Job Frontend len(in_states) 1 +2025-08-07T13:53:52Z INFO 48510 [job.Frontend.0]: Processing input #0 +2025-08-07T13:53:52Z INFO 48510 [job.Frontend.0]: Start model loading +2025-08-07T13:53:52Z INFO 48510 [job.Frontend.0]: Start tensorization +2025-08-07T13:53:52Z INFO 48510 [job.Frontend.0]: Num jobs: 1 +2025-08-07T13:53:52Z USER 48510 [root/Tensorizer/Tensorizer]: Running Tensorizer +2025-08-07T13:53:52Z INFO 48510 [Tensorizer]: Frontend did not find netlist info. Switching to flat flow. +2025-08-07T13:53:52Z INFO 48510 [Tensorizer]: Building model from Penguin script "penguin.py"... +2025-08-07T13:53:53Z INFO 48510 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=1 --num-neuroncores-per-sengine=1 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --keep-remat-dma-transpose --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-08-07T13:53:53Z INFO 48510 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:53:53Z INFO 48510 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 48510 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:53:53Z INFO 48510 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-08-07T13:53:53Z INFO 48510 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48510 [sg0000/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.039 seconds +2025-08-07T13:53:53Z INFO 48510 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-08-07T13:53:53Z INFO 48510 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48510 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.019 seconds +2025-08-07T13:53:53Z INFO 48510 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:53Z INFO 48510 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 48510 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.123 seconds +2025-08-07T13:53:53Z INFO 48510 [sg0000/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-08-07T13:53:53Z INFO 48510 [sg0000/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48510 [sg0000/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.073 seconds +2025-08-07T13:53:53Z INFO 48510 [sg0000/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-08-07T13:53:54Z INFO 48510 [sg0000/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 48510 [sg0000/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.510 seconds +2025-08-07T13:53:54Z INFO 48510 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:53:54Z INFO 48510 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:54Z INFO 48510 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 48510 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.005 seconds +2025-08-07T13:53:54Z INFO 48510 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:54Z INFO 48510 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 48510 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.365 seconds +2025-08-07T13:53:54Z INFO 48510 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.402 seconds +2025-08-07T13:53:54Z INFO 48510 [sg0000/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-08-07T13:53:55Z INFO 48510 [sg0000/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48510 [sg0000/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.466 seconds +2025-08-07T13:53:55Z INFO 48510 [sg0000/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-08-07T13:53:55Z INFO 48510 [sg0000/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48510 [sg0000/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.076 seconds +2025-08-07T13:53:55Z INFO 48510 [sg0000/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-08-07T13:53:55Z INFO 48510 [sg0000/Tensorizer/LegalizeCCOpLayout]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48510 [sg0000/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.084 seconds +2025-08-07T13:53:55Z INFO 48510 [sg0000/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-08-07T13:53:55Z INFO 48510 [sg0000/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48510 [sg0000/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.053 seconds +2025-08-07T13:53:55Z INFO 48510 [sg0000/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-08-07T13:53:55Z INFO 48510 [sg0000/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48510 [sg0000/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.054 seconds +2025-08-07T13:53:55Z INFO 48510 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:53:55Z INFO 48510 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48510 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.201 seconds +2025-08-07T13:53:55Z INFO 48510 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:53:55Z INFO 48510 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48510 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.058 seconds +2025-08-07T13:53:55Z INFO 48510 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:56Z INFO 48510 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:53:56Z INFO 48510 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.787 seconds +2025-08-07T13:53:56Z INFO 48510 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:56Z INFO 48510 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48510 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.053 seconds +2025-08-07T13:53:56Z INFO 48510 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:56Z INFO 48510 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48510 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.057 seconds +2025-08-07T13:53:56Z INFO 48510 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:53:56Z INFO 48510 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48510 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.056 seconds +2025-08-07T13:53:56Z INFO 48510 [sg0000/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-08-07T13:53:56Z INFO 48510 [sg0000/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48510 [sg0000/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.065 seconds +2025-08-07T13:53:56Z INFO 48510 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:56Z INFO 48510 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:56Z INFO 48510 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.056 seconds +2025-08-07T13:53:56Z INFO 48510 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:53:57Z INFO 48510 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48510 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.183 seconds +2025-08-07T13:53:57Z INFO 48510 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:57Z INFO 48510 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48510 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.057 seconds +2025-08-07T13:53:57Z INFO 48510 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-08-07T13:53:58Z INFO 48510 [sg0000/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-08-07T13:53:58Z INFO 48510 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 1.534 seconds +2025-08-07T13:53:58Z INFO 48510 [sg0000/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-08-07T13:53:59Z INFO 48510 [sg0000/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 48510 [sg0000/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.363 seconds +2025-08-07T13:53:59Z INFO 48510 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:53:59Z INFO 48510 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:59Z INFO 48510 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 48510 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.006 seconds +2025-08-07T13:53:59Z INFO 48510 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:59Z INFO 48510 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 48510 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.443 seconds +2025-08-07T13:53:59Z INFO 48510 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.473 seconds +2025-08-07T13:53:59Z INFO 48510 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-08-07T13:54:04Z INFO 48510 [sg0000/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-08-07T13:54:04Z INFO 48510 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 4.920 seconds +2025-08-07T13:54:04Z INFO 48510 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:54:06Z INFO 48510 [sg0000/Tensorizer/LoopFusion]: Finished (changed=True) +2025-08-07T13:54:06Z INFO 48510 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 2.124 seconds +2025-08-07T13:54:06Z INFO 48510 [sg0000/Tensorizer/Rematerialization]: Running Rematerialization +2025-08-07T13:54:06Z INFO 48510 [sg0000/Tensorizer/Rematerialization]: Finished (changed=True) +2025-08-07T13:54:06Z INFO 48510 [sg0000/Tensorizer/Rematerialization]: Rematerialization finished after 0.168 seconds +2025-08-07T13:54:06Z INFO 48510 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:54:07Z INFO 48510 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:54:07Z INFO 48510 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.402 seconds +2025-08-07T13:54:07Z INFO 48510 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:54:07Z INFO 48510 [sg0000/Tensorizer/Delinearization]: Finished (changed=True) +2025-08-07T13:54:07Z INFO 48510 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.515 seconds +2025-08-07T13:54:07Z INFO 48510 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:54:09Z INFO 48510 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=True) +2025-08-07T13:54:09Z INFO 48510 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 1.529 seconds +2025-08-07T13:54:09Z INFO 48510 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:54:09Z INFO 48510 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:54:09Z INFO 48510 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.131 seconds +2025-08-07T13:54:09Z INFO 48510 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:54:09Z INFO 48510 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:54:09Z INFO 48510 [sg0000/Tensorizer/LICM]: LICM finished after 0.086 seconds +2025-08-07T13:54:09Z INFO 48510 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:54:09Z INFO 48510 [sg0000/Tensorizer/Delinearization]: Finished (changed=True) +2025-08-07T13:54:09Z INFO 48510 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.154 seconds +2025-08-07T13:54:09Z INFO 48510 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:54:10Z INFO 48510 [sg0000/Tensorizer/LoopFusion]: Finished (changed=True) +2025-08-07T13:54:10Z INFO 48510 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.541 seconds +2025-08-07T13:54:10Z INFO 48510 [sg0000/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-08-07T13:54:10Z INFO 48510 [sg0000/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-08-07T13:54:10Z INFO 48510 [sg0000/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.037 seconds +2025-08-07T13:54:10Z INFO 48510 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:54:10Z INFO 48510 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:54:10Z INFO 48510 [sg0000/Tensorizer/LICM]: LICM finished after 0.065 seconds +2025-08-07T13:54:10Z INFO 48510 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:54:10Z INFO 48510 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:54:10Z INFO 48510 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.253 seconds +2025-08-07T13:54:10Z INFO 48510 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:54:10Z INFO 48510 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=True) +2025-08-07T13:54:10Z INFO 48510 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.136 seconds +2025-08-07T13:54:10Z INFO 48510 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:54:10Z INFO 48510 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:54:10Z INFO 48510 [sg0000/Tensorizer/LICM]: LICM finished after 0.062 seconds +2025-08-07T13:54:10Z INFO 48510 [sg0000/Tensorizer/PadElimination]: Running PadElimination +2025-08-07T13:54:10Z INFO 48510 [sg0000/Tensorizer/PadElimination]: Finished (changed=False) +2025-08-07T13:54:10Z INFO 48510 [sg0000/Tensorizer/PadElimination]: PadElimination finished after 0.009 seconds +2025-08-07T13:54:10Z INFO 48510 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:54:11Z INFO 48510 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:54:11Z INFO 48510 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.151 seconds +2025-08-07T13:54:11Z INFO 48510 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:54:11Z INFO 48510 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-08-07T13:54:11Z INFO 48510 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.336 seconds +2025-08-07T13:54:11Z INFO 48510 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:54:11Z INFO 48510 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:54:11Z INFO 48510 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.035 seconds +2025-08-07T13:54:11Z INFO 48510 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:54:11Z INFO 48510 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:54:11Z INFO 48510 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.125 seconds +2025-08-07T13:54:11Z INFO 48510 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:54:11Z INFO 48510 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:54:11Z INFO 48510 [sg0000/Tensorizer/LICM]: LICM finished after 0.064 seconds +2025-08-07T13:54:11Z INFO 48510 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:54:11Z INFO 48510 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-08-07T13:54:11Z INFO 48510 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.107 seconds +2025-08-07T13:54:11Z INFO 48510 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:54:11Z INFO 48510 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:54:11Z INFO 48510 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.039 seconds +2025-08-07T13:54:11Z INFO 48510 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:54:11Z INFO 48510 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:54:11Z INFO 48510 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.037 seconds +2025-08-07T13:54:11Z INFO 48510 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-08-07T13:54:12Z INFO 48510 [sg0000/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-08-07T13:54:12Z INFO 48510 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.210 seconds +2025-08-07T13:54:12Z INFO 48510 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:54:12Z INFO 48510 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-08-07T13:54:12Z INFO 48510 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.184 seconds +2025-08-07T13:54:12Z INFO 48510 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:54:12Z INFO 48510 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-08-07T13:54:12Z INFO 48510 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.417 seconds +2025-08-07T13:54:12Z INFO 48510 [sg0000/Tensorizer/Recompute]: Running Recompute +2025-08-07T13:54:12Z INFO 48510 [sg0000/Tensorizer/Recompute]: Finished (changed=False) +2025-08-07T13:54:12Z INFO 48510 [sg0000/Tensorizer/Recompute]: Recompute finished after 0.007 seconds +2025-08-07T13:54:12Z INFO 48510 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:54:12Z INFO 48510 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:54:12Z INFO 48510 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.039 seconds +2025-08-07T13:54:12Z INFO 48510 [Tensorizer]: After optimization: 1185 statements +2025-08-07T13:54:12Z INFO 48510 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:54:12Z INFO 48510 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:54:12Z INFO 48510 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:54:12Z INFO 48510 [sg0000/Tensorizer/MutateDataType]: Running MutateDataType +2025-08-07T13:54:12Z INFO 48510 [sg0000/Tensorizer/MutateDataType]: Finished (changed=False) +2025-08-07T13:54:12Z INFO 48510 [sg0000/Tensorizer/MutateDataType]: MutateDataType finished after 0.049 seconds +2025-08-07T13:54:12Z INFO 48510 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:54:12Z INFO 48510 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:54:12Z INFO 48510 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.036 seconds +2025-08-07T13:54:12Z INFO 48510 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:54:12Z INFO 48510 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:54:12Z INFO 48510 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.126 seconds +2025-08-07T13:54:12Z INFO 48510 [sg0000/Tensorizer/TileCCOps]: Running TileCCOps +2025-08-07T13:54:13Z INFO 48510 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=8192 is not above min_allgather_tile_size_in_bytes=8388608` +2025-08-07T13:54:13Z INFO 48510 [sg0000/Tensorizer/TileCCOps]: in bfloat16 (4096,) %'all_gather.1' = AllGatherOp-502 AllGather_add(bfloat16 (2048,) %'gather.1', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((4096,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.47 | hlo_id: 47 | , id = 502 +2025-08-07T13:54:13Z INFO 48510 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `All gather output tensor check failed` +2025-08-07T13:54:13Z INFO 48510 [sg0000/Tensorizer/TileCCOps]: in float32 (512,) %'all_gather.2' = AllGatherOp-10901 AllGather_add(float32 (256,) %'add.217', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.12066 | hlo_id: 12066 | , id = 10901 +2025-08-07T13:54:13Z INFO 48510 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=2048 is not above min_allgather_tile_size_in_bytes=8388608` +2025-08-07T13:54:13Z INFO 48510 [sg0000/Tensorizer/TileCCOps]: in uint32 (512,) %'all_gather.3' = AllGatherOp-10917 AllGather_add(uint32 (256,) %'add.218', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.12201 | hlo_id: 12201 | , id = 10917 +2025-08-07T13:54:13Z INFO 48510 [sg0000/Tensorizer/TileCCOps]: Finished (changed=False) +2025-08-07T13:54:13Z INFO 48510 [sg0000/Tensorizer/TileCCOps]: TileCCOps finished after 0.226 seconds +2025-08-07T13:54:13Z INFO 48510 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:54:13Z INFO 48510 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-08-07T13:54:13Z INFO 48510 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.495 seconds +2025-08-07T13:54:13Z INFO 48510 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:54:13Z INFO 48510 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:54:13Z INFO 48510 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.141 seconds +2025-08-07T13:54:13Z INFO 48510 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:54:14Z INFO 48510 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:54:14Z INFO 48510 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.335 seconds +2025-08-07T13:54:14Z INFO 48510 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:54:14Z INFO 48510 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:54:14Z INFO 48510 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.039 seconds +2025-08-07T13:54:14Z INFO 48510 [sg0000/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-08-07T13:54:14Z INFO 48510 [sg0000/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-08-07T13:54:14Z INFO 48510 [sg0000/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.046 seconds +2025-08-07T13:54:14Z INFO 48510 [sg0000/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-08-07T13:54:14Z INFO 48510 [sg0000/Tensorizer/InferIntrinsicOnCC]: Finished (changed=True) +2025-08-07T13:54:14Z INFO 48510 [sg0000/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.436 seconds +2025-08-07T13:54:14Z INFO 48510 [sg0000/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-08-07T13:54:15Z INFO 48510 [sg0000/Tensorizer/ResolveAccessConflict]: Finished (changed=True) +2025-08-07T13:54:15Z INFO 48510 [sg0000/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.256 seconds +2025-08-07T13:54:15Z INFO 48510 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:54:15Z INFO 48510 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:54:15Z INFO 48510 [sg0000/Tensorizer/LICM]: LICM finished after 0.079 seconds +2025-08-07T13:54:15Z INFO 48510 [sg0000/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-08-07T13:54:15Z INFO 48510 [sg0000/Tensorizer/LocalLayoutOpt]: Finished (changed=False) +2025-08-07T13:54:15Z INFO 48510 [sg0000/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.404 seconds +2025-08-07T13:54:15Z INFO 48510 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:54:15Z INFO 48510 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:54:15Z INFO 48510 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.349 seconds +2025-08-07T13:54:15Z INFO 48510 [sg0000/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-08-07T13:54:15Z INFO 48510 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:54:16Z INFO 48510 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-08-07T13:54:16Z INFO 48510 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.205 seconds +2025-08-07T13:54:16Z INFO 48510 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-08-07T13:54:16Z INFO 48510 [sg0000/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-08-07T13:54:16Z INFO 48510 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:54:16Z INFO 48510 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:54:16Z INFO 48510 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.212 seconds +2025-08-07T13:54:17Z INFO 48510 [sg0000/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-08-07T13:54:17Z INFO 48510 [sg0000/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 1.043 seconds +2025-08-07T13:54:17Z INFO 48510 [sg0000/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-08-07T13:54:17Z INFO 48510 [sg0000/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.332 seconds +2025-08-07T13:54:17Z INFO 48510 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 1.388 seconds +2025-08-07T13:54:17Z INFO 48510 [sg0000/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-08-07T13:54:17Z INFO 48510 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:54:18Z INFO 48510 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:54:21Z INFO 48510 [sg0000/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-08-07T13:54:21Z INFO 48510 [sg0000/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 4.240 seconds +2025-08-07T13:54:21Z INFO 48510 [sg0000/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-08-07T13:54:21Z INFO 48510 [sg0000/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-08-07T13:54:22Z INFO 48510 [sg0000/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-08-07T13:54:50Z INFO 48510 [sg0000/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-08-07T13:54:50Z INFO 48510 [sg0000/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 28.644 seconds +2025-08-07T13:54:50Z INFO 48510 [sg0000/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-08-07T13:54:51Z INFO 48510 [sg0000/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-08-07T13:54:51Z INFO 48510 [sg0000/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.773 seconds +2025-08-07T13:54:51Z INFO 48510 [sg0000/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 29.426 seconds +2025-08-07T13:54:51Z INFO 48510 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:54:51Z INFO 48510 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-08-07T13:54:51Z INFO 48510 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.144 seconds +2025-08-07T13:54:51Z INFO 48510 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-08-07T13:54:51Z INFO 48510 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-08-07T13:54:51Z INFO 48510 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.221 seconds +2025-08-07T13:54:51Z INFO 48510 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:54:51Z INFO 48510 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-08-07T13:54:51Z INFO 48510 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.239 seconds +2025-08-07T13:54:51Z INFO 48510 [sg0000/Tensorizer/PGTiling]: Running PGTiling +2025-08-07T13:54:51Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11158 of IO tensor {'CrossPassTensor': ''}bfloat16 %input4|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(1, 'AG2803'), (260, 'AG2797'), (152, 'AG2801')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11435 of IO tensor {'CrossPassTensor': ''}bfloat16 %input6|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(3, 'AG2817'), (260, 'AG2797'), (155, 'AG2815')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11686 of IO tensor {'CrossPassTensor': ''}bfloat16 %input8|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(5, 'AG2829'), (260, 'AG2797'), (158, 'AG2827')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11937 of IO tensor {'CrossPassTensor': ''}bfloat16 %input10|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(7, 'AG2841'), (260, 'AG2797'), (161, 'AG2839')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12188 of IO tensor {'CrossPassTensor': ''}bfloat16 %input12|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(9, 'AG2853'), (260, 'AG2797'), (164, 'AG2851')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12439 of IO tensor {'CrossPassTensor': ''}bfloat16 %input14|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(11, 'AG2865'), (260, 'AG2797'), (167, 'AG2863')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12690 of IO tensor {'CrossPassTensor': ''}bfloat16 %input16|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(13, 'AG2877'), (260, 'AG2797'), (170, 'AG2875')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12941 of IO tensor {'CrossPassTensor': ''}bfloat16 %input18|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(15, 'AG2889'), (260, 'AG2797'), (173, 'AG2887')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13192 of IO tensor {'CrossPassTensor': ''}bfloat16 %input20|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(17, 'AG2901'), (260, 'AG2797'), (176, 'AG2899')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13443 of IO tensor {'CrossPassTensor': ''}bfloat16 %input22|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(19, 'AG2913'), (260, 'AG2797'), (179, 'AG2911')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13694 of IO tensor {'CrossPassTensor': ''}bfloat16 %input24|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(21, 'AG2925'), (260, 'AG2797'), (182, 'AG2923')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13945 of IO tensor {'CrossPassTensor': ''}bfloat16 %input26|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(23, 'AG2937'), (260, 'AG2797'), (185, 'AG2935')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14196 of IO tensor {'CrossPassTensor': ''}bfloat16 %input28|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(25, 'AG2949'), (260, 'AG2797'), (188, 'AG2947')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14447 of IO tensor {'CrossPassTensor': ''}bfloat16 %input30|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(27, 'AG2961'), (260, 'AG2797'), (191, 'AG2959')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14698 of IO tensor {'CrossPassTensor': ''}bfloat16 %input32|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(29, 'AG2973'), (260, 'AG2797'), (194, 'AG2971')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14949 of IO tensor {'CrossPassTensor': ''}bfloat16 %input34|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(31, 'AG2985'), (260, 'AG2797'), (197, 'AG2983')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15200 of IO tensor {'CrossPassTensor': ''}bfloat16 %input36|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(33, 'AG2997'), (260, 'AG2797'), (200, 'AG2995')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15451 of IO tensor {'CrossPassTensor': ''}bfloat16 %input38|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(35, 'AG3009'), (260, 'AG2797'), (203, 'AG3007')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15702 of IO tensor {'CrossPassTensor': ''}bfloat16 %input40|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(37, 'AG3021'), (260, 'AG2797'), (206, 'AG3019')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15953 of IO tensor {'CrossPassTensor': ''}bfloat16 %input42|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(39, 'AG3033'), (260, 'AG2797'), (209, 'AG3031')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16204 of IO tensor {'CrossPassTensor': ''}bfloat16 %input44|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(41, 'AG3045'), (260, 'AG2797'), (212, 'AG3043')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16455 of IO tensor {'CrossPassTensor': ''}bfloat16 %input46|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(43, 'AG3057'), (260, 'AG2797'), (215, 'AG3055')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16706 of IO tensor {'CrossPassTensor': ''}bfloat16 %input48|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(45, 'AG3069'), (260, 'AG2797'), (218, 'AG3067')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16957 of IO tensor {'CrossPassTensor': ''}bfloat16 %input50|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(47, 'AG3081'), (260, 'AG2797'), (221, 'AG3079')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 17208 of IO tensor {'CrossPassTensor': ''}bfloat16 %input52|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(49, 'AG3093'), (260, 'AG2797'), (224, 'AG3091')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 17459 of IO tensor {'CrossPassTensor': ''}bfloat16 %input54|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(51, 'AG3105'), (260, 'AG2797'), (227, 'AG3103')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 17710 of IO tensor {'CrossPassTensor': ''}bfloat16 %input56|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(53, 'AG3117'), (260, 'AG2797'), (230, 'AG3115')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 17961 of IO tensor {'CrossPassTensor': ''}bfloat16 %input58|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(55, 'AG3129'), (260, 'AG2797'), (233, 'AG3127')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 18212 of IO tensor {'CrossPassTensor': ''}bfloat16 %input60|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(57, 'AG3141'), (260, 'AG2797'), (236, 'AG3139')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 18463 of IO tensor {'CrossPassTensor': ''}bfloat16 %input62|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(59, 'AG3153'), (260, 'AG2797'), (239, 'AG3151')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 18714 of IO tensor {'CrossPassTensor': ''}bfloat16 %input64|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(61, 'AG3165'), (260, 'AG2797'), (242, 'AG3163')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 18965 of IO tensor {'CrossPassTensor': ''}bfloat16 %input66|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(63, 'AG3177'), (260, 'AG2797'), (245, 'AG3175')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 19216 of IO tensor {'CrossPassTensor': ''}bfloat16 %input68|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(65, 'AG3189'), (260, 'AG2797'), (248, 'AG3187')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 19467 of IO tensor {'CrossPassTensor': ''}bfloat16 %input70|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(67, 'AG3201'), (260, 'AG2797'), (251, 'AG3199')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 19718 of IO tensor {'CrossPassTensor': ''}bfloat16 %input72|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(69, 'AG3213'), (260, 'AG2797'), (254, 'AG3211')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 19969 of IO tensor {'CrossPassTensor': ''}bfloat16 %input74|NHWC|(1, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(71, 'AG3225'), (260, 'AG2797'), (257, 'AG3223')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11309 of IO tensor {'CrossPassTensor': ''}bfloat16 %input77(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(153, 'AG2807'), (1, 'AG2803'), (80, 'AG2802'), (264, 'AG2806'), (409, 'AG2805')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28661 of IO tensor {'CrossPassTensor': ''}bfloat16 %input78|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(408, 'AG2798'), (261, 'AG2799')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28652 of IO tensor {'CrossPassTensor': ''}bfloat16 %input79|NC|(128, 32) is not sorted, index list (w/ AG ids): [(408, 'AG2798'), (261, 'AG2799')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28658 of IO tensor {'CrossPassTensor': ''}bfloat16 %input81(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(408, 'AG2798'), (261, 'AG2799')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28653 of IO tensor {'CrossPassTensor': ''}bfloat16 %input83(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(408, 'AG2798'), (261, 'AG2799')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28666 of IO tensor {'CrossPassTensor': ''}bfloat16 %input84(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(81, 'AG2812'), (266, 'AG2810'), (154, 'AG2811')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28665 of IO tensor {'CrossPassTensor': ''}bfloat16 %input85|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28663 of IO tensor {'CrossPassTensor': ''}bfloat16 %input86|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28664 of IO tensor {'CrossPassTensor': ''}bfloat16 %input87|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11582 of IO tensor {'CrossPassTensor': ''}bfloat16 %input88(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(156, 'AG2821'), (3, 'AG2817'), (82, 'AG2816'), (269, 'AG2820'), (411, 'AG2819')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28674 of IO tensor {'CrossPassTensor': ''}bfloat16 %input89|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28667 of IO tensor {'CrossPassTensor': ''}bfloat16 %input90|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28671 of IO tensor {'CrossPassTensor': ''}bfloat16 %input92(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28668 of IO tensor {'CrossPassTensor': ''}bfloat16 %input94(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28679 of IO tensor {'CrossPassTensor': ''}bfloat16 %input95(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(83, 'AG2824'), (270, 'AG2822'), (157, 'AG2823')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28678 of IO tensor {'CrossPassTensor': ''}bfloat16 %input96|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28676 of IO tensor {'CrossPassTensor': ''}bfloat16 %input97|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28677 of IO tensor {'CrossPassTensor': ''}bfloat16 %input98|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11833 of IO tensor {'CrossPassTensor': ''}bfloat16 %input99(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(159, 'AG2833'), (5, 'AG2829'), (84, 'AG2828'), (273, 'AG2832'), (412, 'AG2831')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28687 of IO tensor {'CrossPassTensor': ''}bfloat16 %input100|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28680 of IO tensor {'CrossPassTensor': ''}bfloat16 %input101|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28684 of IO tensor {'CrossPassTensor': ''}bfloat16 %input103(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28681 of IO tensor {'CrossPassTensor': ''}bfloat16 %input105(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28692 of IO tensor {'CrossPassTensor': ''}bfloat16 %input106(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(85, 'AG2836'), (274, 'AG2834'), (160, 'AG2835')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28691 of IO tensor {'CrossPassTensor': ''}bfloat16 %input107|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28689 of IO tensor {'CrossPassTensor': ''}bfloat16 %input108|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28690 of IO tensor {'CrossPassTensor': ''}bfloat16 %input109|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12084 of IO tensor {'CrossPassTensor': ''}bfloat16 %input110(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(162, 'AG2845'), (7, 'AG2841'), (86, 'AG2840'), (277, 'AG2844'), (413, 'AG2843')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28700 of IO tensor {'CrossPassTensor': ''}bfloat16 %input111|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28693 of IO tensor {'CrossPassTensor': ''}bfloat16 %input112|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28697 of IO tensor {'CrossPassTensor': ''}bfloat16 %input114(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28694 of IO tensor {'CrossPassTensor': ''}bfloat16 %input116(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28705 of IO tensor {'CrossPassTensor': ''}bfloat16 %input117(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(87, 'AG2848'), (278, 'AG2846'), (163, 'AG2847')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28704 of IO tensor {'CrossPassTensor': ''}bfloat16 %input118|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28702 of IO tensor {'CrossPassTensor': ''}bfloat16 %input119|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28703 of IO tensor {'CrossPassTensor': ''}bfloat16 %input120|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12335 of IO tensor {'CrossPassTensor': ''}bfloat16 %input121(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(165, 'AG2857'), (9, 'AG2853'), (88, 'AG2852'), (281, 'AG2856'), (414, 'AG2855')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28713 of IO tensor {'CrossPassTensor': ''}bfloat16 %input122|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28706 of IO tensor {'CrossPassTensor': ''}bfloat16 %input123|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28710 of IO tensor {'CrossPassTensor': ''}bfloat16 %input125(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28707 of IO tensor {'CrossPassTensor': ''}bfloat16 %input127(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28718 of IO tensor {'CrossPassTensor': ''}bfloat16 %input128(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(89, 'AG2860'), (282, 'AG2858'), (166, 'AG2859')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28717 of IO tensor {'CrossPassTensor': ''}bfloat16 %input129|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28715 of IO tensor {'CrossPassTensor': ''}bfloat16 %input130|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28716 of IO tensor {'CrossPassTensor': ''}bfloat16 %input131|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12586 of IO tensor {'CrossPassTensor': ''}bfloat16 %input132(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(168, 'AG2869'), (11, 'AG2865'), (90, 'AG2864'), (285, 'AG2868'), (415, 'AG2867')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28726 of IO tensor {'CrossPassTensor': ''}bfloat16 %input133|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28719 of IO tensor {'CrossPassTensor': ''}bfloat16 %input134|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28723 of IO tensor {'CrossPassTensor': ''}bfloat16 %input136(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28720 of IO tensor {'CrossPassTensor': ''}bfloat16 %input138(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28731 of IO tensor {'CrossPassTensor': ''}bfloat16 %input139(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(91, 'AG2872'), (286, 'AG2870'), (169, 'AG2871')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28730 of IO tensor {'CrossPassTensor': ''}bfloat16 %input140|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28728 of IO tensor {'CrossPassTensor': ''}bfloat16 %input141|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28729 of IO tensor {'CrossPassTensor': ''}bfloat16 %input142|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12837 of IO tensor {'CrossPassTensor': ''}bfloat16 %input143(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(171, 'AG2881'), (13, 'AG2877'), (92, 'AG2876'), (289, 'AG2880'), (416, 'AG2879')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28739 of IO tensor {'CrossPassTensor': ''}bfloat16 %input144|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28732 of IO tensor {'CrossPassTensor': ''}bfloat16 %input145|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28736 of IO tensor {'CrossPassTensor': ''}bfloat16 %input147(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28733 of IO tensor {'CrossPassTensor': ''}bfloat16 %input149(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28744 of IO tensor {'CrossPassTensor': ''}bfloat16 %input150(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(93, 'AG2884'), (290, 'AG2882'), (172, 'AG2883')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28743 of IO tensor {'CrossPassTensor': ''}bfloat16 %input151|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28741 of IO tensor {'CrossPassTensor': ''}bfloat16 %input152|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28742 of IO tensor {'CrossPassTensor': ''}bfloat16 %input153|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13088 of IO tensor {'CrossPassTensor': ''}bfloat16 %input154(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(174, 'AG2893'), (15, 'AG2889'), (94, 'AG2888'), (293, 'AG2892'), (417, 'AG2891')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28752 of IO tensor {'CrossPassTensor': ''}bfloat16 %input155|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28745 of IO tensor {'CrossPassTensor': ''}bfloat16 %input156|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28749 of IO tensor {'CrossPassTensor': ''}bfloat16 %input158(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28746 of IO tensor {'CrossPassTensor': ''}bfloat16 %input160(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28757 of IO tensor {'CrossPassTensor': ''}bfloat16 %input161(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(95, 'AG2896'), (294, 'AG2894'), (175, 'AG2895')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28756 of IO tensor {'CrossPassTensor': ''}bfloat16 %input162|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28754 of IO tensor {'CrossPassTensor': ''}bfloat16 %input163|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28755 of IO tensor {'CrossPassTensor': ''}bfloat16 %input164|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13339 of IO tensor {'CrossPassTensor': ''}bfloat16 %input165(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(177, 'AG2905'), (17, 'AG2901'), (96, 'AG2900'), (297, 'AG2904'), (418, 'AG2903')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28765 of IO tensor {'CrossPassTensor': ''}bfloat16 %input166|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28758 of IO tensor {'CrossPassTensor': ''}bfloat16 %input167|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28762 of IO tensor {'CrossPassTensor': ''}bfloat16 %input169(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28759 of IO tensor {'CrossPassTensor': ''}bfloat16 %input171(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28770 of IO tensor {'CrossPassTensor': ''}bfloat16 %input172(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(97, 'AG2908'), (298, 'AG2906'), (178, 'AG2907')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28769 of IO tensor {'CrossPassTensor': ''}bfloat16 %input173|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28767 of IO tensor {'CrossPassTensor': ''}bfloat16 %input174|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28768 of IO tensor {'CrossPassTensor': ''}bfloat16 %input175|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13590 of IO tensor {'CrossPassTensor': ''}bfloat16 %input176(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(180, 'AG2917'), (19, 'AG2913'), (98, 'AG2912'), (301, 'AG2916'), (419, 'AG2915')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28778 of IO tensor {'CrossPassTensor': ''}bfloat16 %input177|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28771 of IO tensor {'CrossPassTensor': ''}bfloat16 %input178|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28775 of IO tensor {'CrossPassTensor': ''}bfloat16 %input180(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28772 of IO tensor {'CrossPassTensor': ''}bfloat16 %input182(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28783 of IO tensor {'CrossPassTensor': ''}bfloat16 %input183(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(99, 'AG2920'), (302, 'AG2918'), (181, 'AG2919')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28782 of IO tensor {'CrossPassTensor': ''}bfloat16 %input184|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28780 of IO tensor {'CrossPassTensor': ''}bfloat16 %input185|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28781 of IO tensor {'CrossPassTensor': ''}bfloat16 %input186|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13841 of IO tensor {'CrossPassTensor': ''}bfloat16 %input187(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(183, 'AG2929'), (21, 'AG2925'), (100, 'AG2924'), (305, 'AG2928'), (420, 'AG2927')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28791 of IO tensor {'CrossPassTensor': ''}bfloat16 %input188|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28784 of IO tensor {'CrossPassTensor': ''}bfloat16 %input189|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28788 of IO tensor {'CrossPassTensor': ''}bfloat16 %input191(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28785 of IO tensor {'CrossPassTensor': ''}bfloat16 %input193(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28796 of IO tensor {'CrossPassTensor': ''}bfloat16 %input194(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(101, 'AG2932'), (306, 'AG2930'), (184, 'AG2931')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28795 of IO tensor {'CrossPassTensor': ''}bfloat16 %input195|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28793 of IO tensor {'CrossPassTensor': ''}bfloat16 %input196|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28794 of IO tensor {'CrossPassTensor': ''}bfloat16 %input197|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14092 of IO tensor {'CrossPassTensor': ''}bfloat16 %input198(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(186, 'AG2941'), (23, 'AG2937'), (102, 'AG2936'), (309, 'AG2940'), (421, 'AG2939')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28804 of IO tensor {'CrossPassTensor': ''}bfloat16 %input199|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28797 of IO tensor {'CrossPassTensor': ''}bfloat16 %input200|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28801 of IO tensor {'CrossPassTensor': ''}bfloat16 %input202(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28798 of IO tensor {'CrossPassTensor': ''}bfloat16 %input204(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28809 of IO tensor {'CrossPassTensor': ''}bfloat16 %input205(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(103, 'AG2944'), (310, 'AG2942'), (187, 'AG2943')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28808 of IO tensor {'CrossPassTensor': ''}bfloat16 %input206|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28806 of IO tensor {'CrossPassTensor': ''}bfloat16 %input207|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28807 of IO tensor {'CrossPassTensor': ''}bfloat16 %input208|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14343 of IO tensor {'CrossPassTensor': ''}bfloat16 %input209(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(189, 'AG2953'), (25, 'AG2949'), (104, 'AG2948'), (313, 'AG2952'), (422, 'AG2951')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28817 of IO tensor {'CrossPassTensor': ''}bfloat16 %input210|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28810 of IO tensor {'CrossPassTensor': ''}bfloat16 %input211|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28814 of IO tensor {'CrossPassTensor': ''}bfloat16 %input213(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28811 of IO tensor {'CrossPassTensor': ''}bfloat16 %input215(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28822 of IO tensor {'CrossPassTensor': ''}bfloat16 %input216(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(105, 'AG2956'), (314, 'AG2954'), (190, 'AG2955')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28821 of IO tensor {'CrossPassTensor': ''}bfloat16 %input217|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28819 of IO tensor {'CrossPassTensor': ''}bfloat16 %input218|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28820 of IO tensor {'CrossPassTensor': ''}bfloat16 %input219|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14594 of IO tensor {'CrossPassTensor': ''}bfloat16 %input220(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(192, 'AG2965'), (27, 'AG2961'), (106, 'AG2960'), (317, 'AG2964'), (423, 'AG2963')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28830 of IO tensor {'CrossPassTensor': ''}bfloat16 %input221|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28823 of IO tensor {'CrossPassTensor': ''}bfloat16 %input222|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28827 of IO tensor {'CrossPassTensor': ''}bfloat16 %input224(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28824 of IO tensor {'CrossPassTensor': ''}bfloat16 %input226(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28835 of IO tensor {'CrossPassTensor': ''}bfloat16 %input227(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(107, 'AG2968'), (318, 'AG2966'), (193, 'AG2967')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28834 of IO tensor {'CrossPassTensor': ''}bfloat16 %input228|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28832 of IO tensor {'CrossPassTensor': ''}bfloat16 %input229|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28833 of IO tensor {'CrossPassTensor': ''}bfloat16 %input230|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14845 of IO tensor {'CrossPassTensor': ''}bfloat16 %input231(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(195, 'AG2977'), (29, 'AG2973'), (108, 'AG2972'), (321, 'AG2976'), (424, 'AG2975')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28843 of IO tensor {'CrossPassTensor': ''}bfloat16 %input232|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28836 of IO tensor {'CrossPassTensor': ''}bfloat16 %input233|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28840 of IO tensor {'CrossPassTensor': ''}bfloat16 %input235(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28837 of IO tensor {'CrossPassTensor': ''}bfloat16 %input237(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28848 of IO tensor {'CrossPassTensor': ''}bfloat16 %input238(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(109, 'AG2980'), (322, 'AG2978'), (196, 'AG2979')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28847 of IO tensor {'CrossPassTensor': ''}bfloat16 %input239|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28845 of IO tensor {'CrossPassTensor': ''}bfloat16 %input240|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28846 of IO tensor {'CrossPassTensor': ''}bfloat16 %input241|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15096 of IO tensor {'CrossPassTensor': ''}bfloat16 %input242(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(198, 'AG2989'), (31, 'AG2985'), (110, 'AG2984'), (325, 'AG2988'), (425, 'AG2987')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28856 of IO tensor {'CrossPassTensor': ''}bfloat16 %input243|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28849 of IO tensor {'CrossPassTensor': ''}bfloat16 %input244|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28853 of IO tensor {'CrossPassTensor': ''}bfloat16 %input246(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28850 of IO tensor {'CrossPassTensor': ''}bfloat16 %input248(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28861 of IO tensor {'CrossPassTensor': ''}bfloat16 %input249(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(111, 'AG2992'), (326, 'AG2990'), (199, 'AG2991')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28860 of IO tensor {'CrossPassTensor': ''}bfloat16 %input250|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28858 of IO tensor {'CrossPassTensor': ''}bfloat16 %input251|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28859 of IO tensor {'CrossPassTensor': ''}bfloat16 %input252|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15347 of IO tensor {'CrossPassTensor': ''}bfloat16 %input253(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(201, 'AG3001'), (33, 'AG2997'), (112, 'AG2996'), (329, 'AG3000'), (426, 'AG2999')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28869 of IO tensor {'CrossPassTensor': ''}bfloat16 %input254|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28862 of IO tensor {'CrossPassTensor': ''}bfloat16 %input255|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28866 of IO tensor {'CrossPassTensor': ''}bfloat16 %input257(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28863 of IO tensor {'CrossPassTensor': ''}bfloat16 %input259(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28874 of IO tensor {'CrossPassTensor': ''}bfloat16 %input260(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(113, 'AG3004'), (330, 'AG3002'), (202, 'AG3003')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28873 of IO tensor {'CrossPassTensor': ''}bfloat16 %input261|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28871 of IO tensor {'CrossPassTensor': ''}bfloat16 %input262|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28872 of IO tensor {'CrossPassTensor': ''}bfloat16 %input263|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15598 of IO tensor {'CrossPassTensor': ''}bfloat16 %input264(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(204, 'AG3013'), (35, 'AG3009'), (114, 'AG3008'), (333, 'AG3012'), (427, 'AG3011')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28882 of IO tensor {'CrossPassTensor': ''}bfloat16 %input265|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28875 of IO tensor {'CrossPassTensor': ''}bfloat16 %input266|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28879 of IO tensor {'CrossPassTensor': ''}bfloat16 %input268(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28876 of IO tensor {'CrossPassTensor': ''}bfloat16 %input270(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28887 of IO tensor {'CrossPassTensor': ''}bfloat16 %input271(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(115, 'AG3016'), (334, 'AG3014'), (205, 'AG3015')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28886 of IO tensor {'CrossPassTensor': ''}bfloat16 %input272|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28884 of IO tensor {'CrossPassTensor': ''}bfloat16 %input273|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28885 of IO tensor {'CrossPassTensor': ''}bfloat16 %input274|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15849 of IO tensor {'CrossPassTensor': ''}bfloat16 %input275(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(207, 'AG3025'), (37, 'AG3021'), (116, 'AG3020'), (337, 'AG3024'), (428, 'AG3023')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28895 of IO tensor {'CrossPassTensor': ''}bfloat16 %input276|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28888 of IO tensor {'CrossPassTensor': ''}bfloat16 %input277|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28892 of IO tensor {'CrossPassTensor': ''}bfloat16 %input279(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28889 of IO tensor {'CrossPassTensor': ''}bfloat16 %input281(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28900 of IO tensor {'CrossPassTensor': ''}bfloat16 %input282(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(117, 'AG3028'), (338, 'AG3026'), (208, 'AG3027')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28899 of IO tensor {'CrossPassTensor': ''}bfloat16 %input283|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28897 of IO tensor {'CrossPassTensor': ''}bfloat16 %input284|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28898 of IO tensor {'CrossPassTensor': ''}bfloat16 %input285|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16100 of IO tensor {'CrossPassTensor': ''}bfloat16 %input286(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(210, 'AG3037'), (39, 'AG3033'), (118, 'AG3032'), (341, 'AG3036'), (429, 'AG3035')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28908 of IO tensor {'CrossPassTensor': ''}bfloat16 %input287|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28901 of IO tensor {'CrossPassTensor': ''}bfloat16 %input288|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28905 of IO tensor {'CrossPassTensor': ''}bfloat16 %input290(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28902 of IO tensor {'CrossPassTensor': ''}bfloat16 %input292(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28913 of IO tensor {'CrossPassTensor': ''}bfloat16 %input293(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(119, 'AG3040'), (342, 'AG3038'), (211, 'AG3039')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28912 of IO tensor {'CrossPassTensor': ''}bfloat16 %input294|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28910 of IO tensor {'CrossPassTensor': ''}bfloat16 %input295|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28911 of IO tensor {'CrossPassTensor': ''}bfloat16 %input296|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16351 of IO tensor {'CrossPassTensor': ''}bfloat16 %input297(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(213, 'AG3049'), (41, 'AG3045'), (120, 'AG3044'), (345, 'AG3048'), (430, 'AG3047')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28921 of IO tensor {'CrossPassTensor': ''}bfloat16 %input298|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28914 of IO tensor {'CrossPassTensor': ''}bfloat16 %input299|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28918 of IO tensor {'CrossPassTensor': ''}bfloat16 %input301(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28915 of IO tensor {'CrossPassTensor': ''}bfloat16 %input303(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28926 of IO tensor {'CrossPassTensor': ''}bfloat16 %input304(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(121, 'AG3052'), (346, 'AG3050'), (214, 'AG3051')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28925 of IO tensor {'CrossPassTensor': ''}bfloat16 %input305|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28923 of IO tensor {'CrossPassTensor': ''}bfloat16 %input306|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28924 of IO tensor {'CrossPassTensor': ''}bfloat16 %input307|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16602 of IO tensor {'CrossPassTensor': ''}bfloat16 %input308(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(216, 'AG3061'), (43, 'AG3057'), (122, 'AG3056'), (349, 'AG3060'), (431, 'AG3059')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28934 of IO tensor {'CrossPassTensor': ''}bfloat16 %input309|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28927 of IO tensor {'CrossPassTensor': ''}bfloat16 %input310|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28931 of IO tensor {'CrossPassTensor': ''}bfloat16 %input312(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28928 of IO tensor {'CrossPassTensor': ''}bfloat16 %input314(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28939 of IO tensor {'CrossPassTensor': ''}bfloat16 %input315(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(123, 'AG3064'), (350, 'AG3062'), (217, 'AG3063')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28938 of IO tensor {'CrossPassTensor': ''}bfloat16 %input316|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28936 of IO tensor {'CrossPassTensor': ''}bfloat16 %input317|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28937 of IO tensor {'CrossPassTensor': ''}bfloat16 %input318|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16853 of IO tensor {'CrossPassTensor': ''}bfloat16 %input319(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(219, 'AG3073'), (45, 'AG3069'), (124, 'AG3068'), (353, 'AG3072'), (432, 'AG3071')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28947 of IO tensor {'CrossPassTensor': ''}bfloat16 %input320|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28940 of IO tensor {'CrossPassTensor': ''}bfloat16 %input321|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28944 of IO tensor {'CrossPassTensor': ''}bfloat16 %input323(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28941 of IO tensor {'CrossPassTensor': ''}bfloat16 %input325(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28952 of IO tensor {'CrossPassTensor': ''}bfloat16 %input326(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(125, 'AG3076'), (354, 'AG3074'), (220, 'AG3075')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28951 of IO tensor {'CrossPassTensor': ''}bfloat16 %input327|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28949 of IO tensor {'CrossPassTensor': ''}bfloat16 %input328|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28950 of IO tensor {'CrossPassTensor': ''}bfloat16 %input329|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 17104 of IO tensor {'CrossPassTensor': ''}bfloat16 %input330(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(222, 'AG3085'), (47, 'AG3081'), (126, 'AG3080'), (357, 'AG3084'), (433, 'AG3083')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28960 of IO tensor {'CrossPassTensor': ''}bfloat16 %input331|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28953 of IO tensor {'CrossPassTensor': ''}bfloat16 %input332|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28957 of IO tensor {'CrossPassTensor': ''}bfloat16 %input334(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28954 of IO tensor {'CrossPassTensor': ''}bfloat16 %input336(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28965 of IO tensor {'CrossPassTensor': ''}bfloat16 %input337(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(127, 'AG3088'), (358, 'AG3086'), (223, 'AG3087')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28964 of IO tensor {'CrossPassTensor': ''}bfloat16 %input338|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28962 of IO tensor {'CrossPassTensor': ''}bfloat16 %input339|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28963 of IO tensor {'CrossPassTensor': ''}bfloat16 %input340|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 17355 of IO tensor {'CrossPassTensor': ''}bfloat16 %input341(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(225, 'AG3097'), (49, 'AG3093'), (128, 'AG3092'), (361, 'AG3096'), (434, 'AG3095')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28973 of IO tensor {'CrossPassTensor': ''}bfloat16 %input342|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28966 of IO tensor {'CrossPassTensor': ''}bfloat16 %input343|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28970 of IO tensor {'CrossPassTensor': ''}bfloat16 %input345(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28967 of IO tensor {'CrossPassTensor': ''}bfloat16 %input347(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28978 of IO tensor {'CrossPassTensor': ''}bfloat16 %input348(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(129, 'AG3100'), (362, 'AG3098'), (226, 'AG3099')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28977 of IO tensor {'CrossPassTensor': ''}bfloat16 %input349|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28975 of IO tensor {'CrossPassTensor': ''}bfloat16 %input350|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28976 of IO tensor {'CrossPassTensor': ''}bfloat16 %input351|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 17606 of IO tensor {'CrossPassTensor': ''}bfloat16 %input352(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(228, 'AG3109'), (51, 'AG3105'), (130, 'AG3104'), (365, 'AG3108'), (435, 'AG3107')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28986 of IO tensor {'CrossPassTensor': ''}bfloat16 %input353|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28979 of IO tensor {'CrossPassTensor': ''}bfloat16 %input354|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28983 of IO tensor {'CrossPassTensor': ''}bfloat16 %input356(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28980 of IO tensor {'CrossPassTensor': ''}bfloat16 %input358(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28991 of IO tensor {'CrossPassTensor': ''}bfloat16 %input359(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(131, 'AG3112'), (366, 'AG3110'), (229, 'AG3111')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28990 of IO tensor {'CrossPassTensor': ''}bfloat16 %input360|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28988 of IO tensor {'CrossPassTensor': ''}bfloat16 %input361|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28989 of IO tensor {'CrossPassTensor': ''}bfloat16 %input362|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 17857 of IO tensor {'CrossPassTensor': ''}bfloat16 %input363(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(231, 'AG3121'), (53, 'AG3117'), (132, 'AG3116'), (369, 'AG3120'), (436, 'AG3119')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28999 of IO tensor {'CrossPassTensor': ''}bfloat16 %input364|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28992 of IO tensor {'CrossPassTensor': ''}bfloat16 %input365|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28996 of IO tensor {'CrossPassTensor': ''}bfloat16 %input367(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28993 of IO tensor {'CrossPassTensor': ''}bfloat16 %input369(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29004 of IO tensor {'CrossPassTensor': ''}bfloat16 %input370(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(133, 'AG3124'), (370, 'AG3122'), (232, 'AG3123')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29003 of IO tensor {'CrossPassTensor': ''}bfloat16 %input371|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29001 of IO tensor {'CrossPassTensor': ''}bfloat16 %input372|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29002 of IO tensor {'CrossPassTensor': ''}bfloat16 %input373|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 18108 of IO tensor {'CrossPassTensor': ''}bfloat16 %input374(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(234, 'AG3133'), (55, 'AG3129'), (134, 'AG3128'), (373, 'AG3132'), (437, 'AG3131')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29012 of IO tensor {'CrossPassTensor': ''}bfloat16 %input375|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29005 of IO tensor {'CrossPassTensor': ''}bfloat16 %input376|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29009 of IO tensor {'CrossPassTensor': ''}bfloat16 %input378(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29006 of IO tensor {'CrossPassTensor': ''}bfloat16 %input380(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29017 of IO tensor {'CrossPassTensor': ''}bfloat16 %input381(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(135, 'AG3136'), (374, 'AG3134'), (235, 'AG3135')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29016 of IO tensor {'CrossPassTensor': ''}bfloat16 %input382|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29014 of IO tensor {'CrossPassTensor': ''}bfloat16 %input383|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29015 of IO tensor {'CrossPassTensor': ''}bfloat16 %input384|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 18359 of IO tensor {'CrossPassTensor': ''}bfloat16 %input385(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(237, 'AG3145'), (57, 'AG3141'), (136, 'AG3140'), (377, 'AG3144'), (438, 'AG3143')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29025 of IO tensor {'CrossPassTensor': ''}bfloat16 %input386|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29018 of IO tensor {'CrossPassTensor': ''}bfloat16 %input387|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29022 of IO tensor {'CrossPassTensor': ''}bfloat16 %input389(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29019 of IO tensor {'CrossPassTensor': ''}bfloat16 %input391(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29030 of IO tensor {'CrossPassTensor': ''}bfloat16 %input392(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(137, 'AG3148'), (378, 'AG3146'), (238, 'AG3147')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29029 of IO tensor {'CrossPassTensor': ''}bfloat16 %input393|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29027 of IO tensor {'CrossPassTensor': ''}bfloat16 %input394|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29028 of IO tensor {'CrossPassTensor': ''}bfloat16 %input395|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 18610 of IO tensor {'CrossPassTensor': ''}bfloat16 %input396(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(240, 'AG3157'), (59, 'AG3153'), (138, 'AG3152'), (381, 'AG3156'), (439, 'AG3155')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29038 of IO tensor {'CrossPassTensor': ''}bfloat16 %input397|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29031 of IO tensor {'CrossPassTensor': ''}bfloat16 %input398|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29035 of IO tensor {'CrossPassTensor': ''}bfloat16 %input400(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29032 of IO tensor {'CrossPassTensor': ''}bfloat16 %input402(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29043 of IO tensor {'CrossPassTensor': ''}bfloat16 %input403(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(139, 'AG3160'), (382, 'AG3158'), (241, 'AG3159')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29042 of IO tensor {'CrossPassTensor': ''}bfloat16 %input404|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29040 of IO tensor {'CrossPassTensor': ''}bfloat16 %input405|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29041 of IO tensor {'CrossPassTensor': ''}bfloat16 %input406|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 18861 of IO tensor {'CrossPassTensor': ''}bfloat16 %input407(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(243, 'AG3169'), (61, 'AG3165'), (140, 'AG3164'), (385, 'AG3168'), (440, 'AG3167')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29051 of IO tensor {'CrossPassTensor': ''}bfloat16 %input408|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29044 of IO tensor {'CrossPassTensor': ''}bfloat16 %input409|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29048 of IO tensor {'CrossPassTensor': ''}bfloat16 %input411(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29045 of IO tensor {'CrossPassTensor': ''}bfloat16 %input413(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29056 of IO tensor {'CrossPassTensor': ''}bfloat16 %input414(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(141, 'AG3172'), (386, 'AG3170'), (244, 'AG3171')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29055 of IO tensor {'CrossPassTensor': ''}bfloat16 %input415|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29053 of IO tensor {'CrossPassTensor': ''}bfloat16 %input416|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29054 of IO tensor {'CrossPassTensor': ''}bfloat16 %input417|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 19112 of IO tensor {'CrossPassTensor': ''}bfloat16 %input418(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(246, 'AG3181'), (63, 'AG3177'), (142, 'AG3176'), (389, 'AG3180'), (441, 'AG3179')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29064 of IO tensor {'CrossPassTensor': ''}bfloat16 %input419|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29057 of IO tensor {'CrossPassTensor': ''}bfloat16 %input420|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29061 of IO tensor {'CrossPassTensor': ''}bfloat16 %input422(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29058 of IO tensor {'CrossPassTensor': ''}bfloat16 %input424(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29069 of IO tensor {'CrossPassTensor': ''}bfloat16 %input425(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(143, 'AG3184'), (390, 'AG3182'), (247, 'AG3183')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29068 of IO tensor {'CrossPassTensor': ''}bfloat16 %input426|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29066 of IO tensor {'CrossPassTensor': ''}bfloat16 %input427|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29067 of IO tensor {'CrossPassTensor': ''}bfloat16 %input428|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 19363 of IO tensor {'CrossPassTensor': ''}bfloat16 %input429(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(249, 'AG3193'), (65, 'AG3189'), (144, 'AG3188'), (393, 'AG3192'), (442, 'AG3191')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29077 of IO tensor {'CrossPassTensor': ''}bfloat16 %input430|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29070 of IO tensor {'CrossPassTensor': ''}bfloat16 %input431|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29074 of IO tensor {'CrossPassTensor': ''}bfloat16 %input433(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29071 of IO tensor {'CrossPassTensor': ''}bfloat16 %input435(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29082 of IO tensor {'CrossPassTensor': ''}bfloat16 %input436(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(145, 'AG3196'), (394, 'AG3194'), (250, 'AG3195')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29081 of IO tensor {'CrossPassTensor': ''}bfloat16 %input437|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29079 of IO tensor {'CrossPassTensor': ''}bfloat16 %input438|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29080 of IO tensor {'CrossPassTensor': ''}bfloat16 %input439|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 19614 of IO tensor {'CrossPassTensor': ''}bfloat16 %input440(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(252, 'AG3205'), (67, 'AG3201'), (146, 'AG3200'), (397, 'AG3204'), (443, 'AG3203')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29090 of IO tensor {'CrossPassTensor': ''}bfloat16 %input441|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29083 of IO tensor {'CrossPassTensor': ''}bfloat16 %input442|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29087 of IO tensor {'CrossPassTensor': ''}bfloat16 %input444(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29084 of IO tensor {'CrossPassTensor': ''}bfloat16 %input446(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29095 of IO tensor {'CrossPassTensor': ''}bfloat16 %input447(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(147, 'AG3208'), (398, 'AG3206'), (253, 'AG3207')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29094 of IO tensor {'CrossPassTensor': ''}bfloat16 %input448|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29092 of IO tensor {'CrossPassTensor': ''}bfloat16 %input449|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29093 of IO tensor {'CrossPassTensor': ''}bfloat16 %input450|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 19865 of IO tensor {'CrossPassTensor': ''}bfloat16 %input451(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(255, 'AG3217'), (69, 'AG3213'), (148, 'AG3212'), (401, 'AG3216'), (444, 'AG3215')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29103 of IO tensor {'CrossPassTensor': ''}bfloat16 %input452|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29096 of IO tensor {'CrossPassTensor': ''}bfloat16 %input453|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29100 of IO tensor {'CrossPassTensor': ''}bfloat16 %input455(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29097 of IO tensor {'CrossPassTensor': ''}bfloat16 %input457(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29108 of IO tensor {'CrossPassTensor': ''}bfloat16 %input458(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(149, 'AG3220'), (402, 'AG3218'), (256, 'AG3219')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29107 of IO tensor {'CrossPassTensor': ''}bfloat16 %input459|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29105 of IO tensor {'CrossPassTensor': ''}bfloat16 %input460|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29106 of IO tensor {'CrossPassTensor': ''}bfloat16 %input461|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 20116 of IO tensor {'CrossPassTensor': ''}bfloat16 %input462(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(258, 'AG3229'), (71, 'AG3225'), (150, 'AG3224'), (405, 'AG3228'), (445, 'AG3227')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29116 of IO tensor {'CrossPassTensor': ''}bfloat16 %input463|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29109 of IO tensor {'CrossPassTensor': ''}bfloat16 %input464|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29113 of IO tensor {'CrossPassTensor': ''}bfloat16 %input466(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29110 of IO tensor {'CrossPassTensor': ''}bfloat16 %input468(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29121 of IO tensor {'CrossPassTensor': ''}bfloat16 %input469(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(151, 'AG3232'), (406, 'AG3230'), (259, 'AG3231')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29120 of IO tensor {'CrossPassTensor': ''}bfloat16 %input470|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29118 of IO tensor {'CrossPassTensor': ''}bfloat16 %input471|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29119 of IO tensor {'CrossPassTensor': ''}bfloat16 %input472|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 29122 of IO tensor {'CrossPassTensor': ''}bfloat16 %input474|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 2.166 seconds +2025-08-07T13:54:53Z INFO 48510 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-08-07T13:54:54Z INFO 48510 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-08-07T13:54:54Z INFO 48510 [sg0000/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.241 seconds +2025-08-07T13:54:54Z INFO 48510 [sg0000/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-08-07T13:54:54Z INFO 48510 [sg0000/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-08-07T13:54:54Z INFO 48510 [sg0000/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.317 seconds +2025-08-07T13:54:54Z INFO 48510 [sg0000/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-08-07T13:54:54Z INFO 48510 [sg0000/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-08-07T13:54:54Z INFO 48510 [sg0000/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.055 seconds +2025-08-07T13:54:54Z INFO 48510 [sg0000/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-08-07T13:54:54Z INFO 48510 [sg0000/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-08-07T13:54:54Z INFO 48510 [sg0000/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.017 seconds +2025-08-07T13:54:54Z INFO 48510 [sg0000/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-08-07T13:54:56Z INFO 48510 [sg0000/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-08-07T13:54:56Z INFO 48510 [sg0000/Tensorizer/MacroGeneration]: MacroGeneration finished after 2.381 seconds +2025-08-07T13:54:56Z INFO 48510 [sg0000/Tensorizer/PGTiling]: PGTiling finished after 5.205 seconds +2025-08-07T13:54:56Z INFO 48510 [sg0000/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-08-07T13:54:57Z INFO 48510 [sg0000/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-08-07T13:54:57Z INFO 48510 [sg0000/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 1.020 seconds +2025-08-07T13:54:57Z INFO 48510 [sg0000/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-08-07T13:54:58Z INFO 48510 [sg0000/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-08-07T13:54:58Z INFO 48510 [sg0000/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.097 seconds +2025-08-07T13:54:58Z INFO 48510 [sg0000/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-08-07T13:54:59Z INFO 48510 [sg0000/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-08-07T13:54:59Z INFO 48510 [sg0000/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 1.097 seconds +2025-08-07T13:54:59Z INFO 48510 [sg0000/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 43.334 seconds +2025-08-07T13:54:59Z INFO 48510 [sg0000/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-08-07T13:54:59Z INFO 48510 [sg0000/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:54:59Z INFO 48510 [sg0000/Tensorizer/TilingBottleneck]: 19008: transpose_128x128 +2025-08-07T13:54:59Z INFO 48510 [sg0000/Tensorizer/TilingBottleneck]: 19008: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48510 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48510 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48510 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48510 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48510 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48510 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48510 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48510 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48510 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48510 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48510 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48510 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48510 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48510 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48510 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48510 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48510 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48510 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48510 [sg0000/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-08-07T13:54:59Z INFO 48510 [sg0000/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.398 seconds +2025-08-07T13:54:59Z INFO 48510 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:55:00Z INFO 48510 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:55:00Z INFO 48510 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.456 seconds +2025-08-07T13:55:00Z INFO 48510 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-08-07T13:55:01Z INFO 48510 [sg0000/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-08-07T13:55:01Z INFO 48510 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 1.784 seconds +2025-08-07T13:55:01Z INFO 48510 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:55:02Z INFO 48510 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:55:02Z INFO 48510 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.273 seconds +2025-08-07T13:55:02Z INFO 48510 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:55:02Z INFO 48510 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:55:02Z INFO 48510 [sg0000/Tensorizer/LICM]: LICM finished after 0.097 seconds +2025-08-07T13:55:02Z INFO 48510 [sg0000/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-08-07T13:55:02Z INFO 48510 [sg0000/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-08-07T13:55:02Z INFO 48510 [sg0000/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.048 seconds +2025-08-07T13:55:02Z INFO 48510 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:55:02Z INFO 48510 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:55:02Z INFO 48510 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.140 seconds +2025-08-07T13:55:02Z INFO 48510 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:55:02Z INFO 48510 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-08-07T13:55:02Z INFO 48510 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.133 seconds +2025-08-07T13:55:02Z INFO 48510 [sg0000/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-08-07T13:55:04Z INFO 48510 [sg0000/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-08-07T13:55:04Z INFO 48510 [sg0000/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 2.022 seconds +2025-08-07T13:55:04Z INFO 48510 [sg0000/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-08-07T13:55:04Z INFO 48510 [sg0000/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:55:04Z INFO 48510 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 19008: transpose_128x128 +2025-08-07T13:55:04Z INFO 48510 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 19008: matmul_128x128x1 +2025-08-07T13:55:04Z INFO 48510 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:04Z INFO 48510 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:04Z INFO 48510 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:04Z INFO 48510 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:04Z INFO 48510 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:04Z INFO 48510 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:04Z INFO 48510 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:04Z INFO 48510 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:04Z INFO 48510 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:04Z INFO 48510 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:04Z INFO 48510 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:04Z INFO 48510 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:04Z INFO 48510 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:04Z INFO 48510 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:04Z INFO 48510 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:04Z INFO 48510 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:04Z INFO 48510 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:04Z INFO 48510 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:04Z INFO 48510 [sg0000/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-08-07T13:55:04Z INFO 48510 [sg0000/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.076 seconds +2025-08-07T13:55:04Z INFO 48510 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:55:04Z INFO 48510 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:55:04Z INFO 48510 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.329 seconds +2025-08-07T13:55:04Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12250 | hlo_id: 12250 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12235 | hlo_id: 12235 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12278 | hlo_id: 12278 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12263 | hlo_id: 12263 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12306 | hlo_id: 12306 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12291 | hlo_id: 12291 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12334 | hlo_id: 12334 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12319 | hlo_id: 12319 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12362 | hlo_id: 12362 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12347 | hlo_id: 12347 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12390 | hlo_id: 12390 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12375 | hlo_id: 12375 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12418 | hlo_id: 12418 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12403 | hlo_id: 12403 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12446 | hlo_id: 12446 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12431 | hlo_id: 12431 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12474 | hlo_id: 12474 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12459 | hlo_id: 12459 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12502 | hlo_id: 12502 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12487 | hlo_id: 12487 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12530 | hlo_id: 12530 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12515 | hlo_id: 12515 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12558 | hlo_id: 12558 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12543 | hlo_id: 12543 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12586 | hlo_id: 12586 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12571 | hlo_id: 12571 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12614 | hlo_id: 12614 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12599 | hlo_id: 12599 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12642 | hlo_id: 12642 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12627 | hlo_id: 12627 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12670 | hlo_id: 12670 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12655 | hlo_id: 12655 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12698 | hlo_id: 12698 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12683 | hlo_id: 12683 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12726 | hlo_id: 12726 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12711 | hlo_id: 12711 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12754 | hlo_id: 12754 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12739 | hlo_id: 12739 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12782 | hlo_id: 12782 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12767 | hlo_id: 12767 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12810 | hlo_id: 12810 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12795 | hlo_id: 12795 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12838 | hlo_id: 12838 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12823 | hlo_id: 12823 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12866 | hlo_id: 12866 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12851 | hlo_id: 12851 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12894 | hlo_id: 12894 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12879 | hlo_id: 12879 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12922 | hlo_id: 12922 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12907 | hlo_id: 12907 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12950 | hlo_id: 12950 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12935 | hlo_id: 12935 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12978 | hlo_id: 12978 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12963 | hlo_id: 12963 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13006 | hlo_id: 13006 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12991 | hlo_id: 12991 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13034 | hlo_id: 13034 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13019 | hlo_id: 13019 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13062 | hlo_id: 13062 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13047 | hlo_id: 13047 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13090 | hlo_id: 13090 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13075 | hlo_id: 13075 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13118 | hlo_id: 13118 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13103 | hlo_id: 13103 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13146 | hlo_id: 13146 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13131 | hlo_id: 13131 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13174 | hlo_id: 13174 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13159 | hlo_id: 13159 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13202 | hlo_id: 13202 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13187 | hlo_id: 13187 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13230 | hlo_id: 13230 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13215 | hlo_id: 13215 | +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.417 seconds +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.343 seconds +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.065 seconds +2025-08-07T13:55:05Z INFO 48510 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:55:06Z INFO 48510 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:55:06Z INFO 48510 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.185 seconds +2025-08-07T13:55:06Z INFO 48510 [sg0000/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-08-07T13:55:06Z INFO 48510 [sg0000/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-08-07T13:55:06Z INFO 48510 [sg0000/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.062 seconds +2025-08-07T13:55:06Z INFO 48510 [sg0000/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-08-07T13:55:06Z INFO 48510 [sg0000/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-08-07T13:55:06Z INFO 48510 [sg0000/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.022 seconds +2025-08-07T13:55:06Z INFO 48510 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:55:06Z INFO 48510 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-08-07T13:55:06Z INFO 48510 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.081 seconds +2025-08-07T13:55:06Z INFO 48510 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:55:06Z INFO 48510 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-08-07T13:55:06Z INFO 48510 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.195 seconds +2025-08-07T13:55:06Z INFO 48510 [sg0000/Tensorizer/InferInitValue]: Running InferInitValue +2025-08-07T13:55:07Z INFO 48510 [sg0000/Tensorizer/InferInitValue]: Finished (changed=True) +2025-08-07T13:55:07Z INFO 48510 [sg0000/Tensorizer/InferInitValue]: InferInitValue finished after 1.300 seconds +2025-08-07T13:55:07Z INFO 48510 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:55:08Z INFO 48510 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:55:08Z INFO 48510 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.352 seconds +2025-08-07T13:55:08Z INFO 48510 [sg0000/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-08-07T13:55:08Z INFO 48510 [sg0000/Tensorizer/SimplifyTensor]: Finished (changed=True) +2025-08-07T13:55:08Z INFO 48510 [sg0000/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.232 seconds +2025-08-07T13:55:08Z INFO 48510 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:55:08Z INFO 48510 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:55:08Z INFO 48510 [sg0000/Tensorizer/LICM]: LICM finished after 0.116 seconds +2025-08-07T13:55:08Z INFO 48510 [sg0000/Tensorizer/SundaISel]: Running SundaISel +2025-08-07T13:55:10Z INFO 48510 [sg0000/Tensorizer/SundaISel]: Finished (changed=True) +2025-08-07T13:55:10Z INFO 48510 [sg0000/Tensorizer/SundaISel]: SundaISel finished after 1.664 seconds +2025-08-07T13:55:10Z INFO 48510 [sg0000/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-08-07T13:55:10Z INFO 48510 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:55:10Z INFO 48510 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-08-07T13:55:10Z INFO 48510 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.003 seconds +2025-08-07T13:55:10Z INFO 48510 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-08-07T13:55:10Z INFO 48510 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=True) +2025-08-07T13:55:10Z INFO 48510 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.026 seconds +2025-08-07T13:55:10Z INFO 48510 [sg0000/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.034 seconds +2025-08-07T13:55:10Z INFO 48510 [sg0000/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-08-07T13:55:10Z INFO 48510 [sg0000/Tensorizer/LowerComplexBroadcast]: Finished (changed=True) +2025-08-07T13:55:10Z INFO 48510 [sg0000/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.162 seconds +2025-08-07T13:55:10Z INFO 48510 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:55:10Z INFO 48510 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-08-07T13:55:10Z INFO 48510 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.052 seconds +2025-08-07T13:55:10Z INFO 48510 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:55:10Z INFO 48510 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:55:10Z INFO 48510 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.059 seconds +2025-08-07T13:55:10Z INFO 48510 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-08-07T13:55:10Z INFO 48510 [sg0000/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-08-07T13:55:10Z INFO 48510 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.517 seconds +2025-08-07T13:55:10Z INFO 48510 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:55:10Z INFO 48510 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-08-07T13:55:10Z INFO 48510 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.051 seconds +2025-08-07T13:55:10Z INFO 48510 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:55:11Z INFO 48510 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-08-07T13:55:11Z INFO 48510 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.286 seconds +2025-08-07T13:55:11Z INFO 48510 [sg0000/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:55:11Z INFO 48510 [sg0000/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-08-07T13:55:11Z INFO 48510 [sg0000/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.286 seconds +2025-08-07T13:55:11Z INFO 48510 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:55:13Z INFO 48510 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-08-07T13:55:13Z INFO 48510 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 1.507 seconds +2025-08-07T13:55:13Z INFO 48510 [sg0000/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:55:13Z INFO 48510 [sg0000/Tensorizer/NeuronValueNumbering]: Finished (changed=True) +2025-08-07T13:55:13Z INFO 48510 [sg0000/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.109 seconds +2025-08-07T13:55:13Z INFO 48510 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:55:13Z INFO 48510 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:55:13Z INFO 48510 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.220 seconds +2025-08-07T13:55:13Z INFO 48510 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-08-07T13:55:13Z INFO 48510 [sg0000/Tensorizer/VectorizeDMA]: Finished (changed=False) +2025-08-07T13:55:13Z INFO 48510 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.038 seconds +2025-08-07T13:55:13Z INFO 48510 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:55:13Z INFO 48510 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:55:13Z INFO 48510 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.049 seconds +2025-08-07T13:55:13Z INFO 48510 [sg0000/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-08-07T13:55:13Z INFO 48510 [sg0000/Tensorizer/LegalizePartitionReduce]: Finished (changed=True) +2025-08-07T13:55:13Z INFO 48510 [sg0000/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.082 seconds +2025-08-07T13:55:13Z INFO 48510 [sg0000/Tensorizer/DeConcat]: Running DeConcat +2025-08-07T13:55:13Z INFO 48510 [sg0000/Tensorizer/DeConcat]: Finished (changed=False) +2025-08-07T13:55:13Z INFO 48510 [sg0000/Tensorizer/DeConcat]: DeConcat finished after 0.019 seconds +2025-08-07T13:55:13Z INFO 48510 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-08-07T13:55:13Z INFO 48510 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-08-07T13:55:13Z INFO 48510 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.047 seconds +2025-08-07T13:55:13Z INFO 48510 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-08-07T13:55:13Z INFO 48510 [sg0000/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-08-07T13:55:13Z INFO 48510 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.302 seconds +2025-08-07T13:55:13Z INFO 48510 [sg0000/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-08-07T13:55:15Z INFO 48510 [sg0000/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-08-07T13:55:15Z INFO 48510 [sg0000/Tensorizer/TritiumFusion]: TritiumFusion finished after 1.224 seconds +2025-08-07T13:55:15Z INFO 48510 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:55:15Z INFO 48510 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-08-07T13:55:15Z INFO 48510 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.340 seconds +2025-08-07T13:55:15Z INFO 48510 [sg0000/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-08-07T13:55:15Z INFO 48510 [sg0000/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-08-07T13:55:15Z INFO 48510 [sg0000/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.025 seconds +2025-08-07T13:55:15Z INFO 48510 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-08-07T13:55:15Z INFO 48510 [sg0000/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-08-07T13:55:15Z INFO 48510 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.347 seconds +2025-08-07T13:55:15Z INFO 48510 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:55:16Z INFO 48510 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:55:16Z INFO 48510 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.203 seconds +2025-08-07T13:55:16Z INFO 48510 [sg0000/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:55:16Z INFO 48510 [sg0000/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-08-07T13:55:16Z INFO 48510 [sg0000/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.394 seconds +2025-08-07T13:55:16Z INFO 48510 [sg0000/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:55:16Z INFO 48510 [sg0000/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:55:16Z INFO 48510 [sg0000/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.053 seconds +2025-08-07T13:55:16Z INFO 48510 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:55:17Z INFO 48510 [sg0000/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-08-07T13:55:17Z INFO 48510 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.502 seconds +2025-08-07T13:55:17Z INFO 48510 [sg0000/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-08-07T13:55:17Z INFO 48510 [sg0000/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-08-07T13:55:17Z INFO 48510 [sg0000/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.042 seconds +2025-08-07T13:55:17Z INFO 48510 [sg0000/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:55:17Z INFO 48510 [sg0000/Tensorizer/SpillPSum]: Finished (changed=True) +2025-08-07T13:55:17Z INFO 48510 [sg0000/Tensorizer/SpillPSum]: SpillPSum finished after 0.387 seconds +2025-08-07T13:55:17Z INFO 48510 [sg0000/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:55:19Z INFO 48510 [sg0000/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-08-07T13:55:19Z INFO 48510 [sg0000/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 1.507 seconds +2025-08-07T13:55:19Z INFO 48510 [sg0000/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-08-07T13:55:19Z INFO 48510 [sg0000/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-08-07T13:55:19Z INFO 48510 [sg0000/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.058 seconds +2025-08-07T13:55:19Z INFO 48510 [sg0000/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:55:19Z INFO 48510 [sg0000/Tensorizer/LegalizeType]: Finished (changed=True) +2025-08-07T13:55:19Z INFO 48510 [sg0000/Tensorizer/LegalizeType]: LegalizeType finished after 0.204 seconds +2025-08-07T13:55:19Z INFO 48510 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:55:19Z INFO 48510 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:55:19Z INFO 48510 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.293 seconds +2025-08-07T13:55:19Z INFO 48510 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:55:20Z INFO 48510 [sg0000/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-08-07T13:55:20Z INFO 48510 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 1.177 seconds +2025-08-07T13:55:20Z INFO 48510 [sg0000/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:55:20Z INFO 48510 [sg0000/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-08-07T13:55:20Z INFO 48510 [sg0000/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.060 seconds +2025-08-07T13:55:20Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:55:22Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-08-07T13:55:22Z INFO 48510 [sg0000/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 1.415 seconds +2025-08-07T13:55:22Z INFO 48510 [sg0000/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-08-07T13:55:22Z INFO 48510 [sg0000/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-08-07T13:55:22Z INFO 48510 [sg0000/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.160 seconds +2025-08-07T13:55:22Z INFO 48510 [sg0000/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-08-07T13:55:22Z INFO 48510 [sg0000/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-08-07T13:55:22Z INFO 48510 [sg0000/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.142 seconds +2025-08-07T13:55:22Z INFO 48510 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:55:22Z INFO 48510 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:55:22Z INFO 48510 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.188 seconds +2025-08-07T13:55:22Z INFO 48510 [sg0000/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:55:22Z INFO 48510 [sg0000/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:55:22Z INFO 48510 [sg0000/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.096 seconds +2025-08-07T13:55:22Z INFO 48510 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:55:24Z INFO 48510 [sg0000/Tensorizer/SimplifyNeuronTensor]: Finished (changed=True) +2025-08-07T13:55:24Z INFO 48510 [sg0000/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 1.374 seconds +2025-08-07T13:55:24Z INFO 48510 [sg0000/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:55:24Z INFO 48510 [sg0000/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-08-07T13:55:24Z INFO 48510 [sg0000/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.037 seconds +2025-08-07T13:55:24Z INFO 48510 [sg0000/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:55:24Z INFO 48510 [sg0000/Tensorizer/DataStreaming]: Finished (changed=False) +2025-08-07T13:55:24Z INFO 48510 [sg0000/Tensorizer/DataStreaming]: DataStreaming finished after 0.162 seconds +2025-08-07T13:55:24Z INFO 48510 [sg0000/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:55:28Z INFO 48510 [sg0000/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:55:28Z INFO 48510 [sg0000/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 4.159 seconds +2025-08-07T13:55:28Z INFO 48510 [sg0000/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:55:28Z INFO 48510 [sg0000/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-08-07T13:55:28Z INFO 48510 [sg0000/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.330 seconds +2025-08-07T13:55:28Z INFO 48510 [sg0000/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:55:29Z INFO 48510 [sg0000/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-08-07T13:55:29Z INFO 48510 [sg0000/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.202 seconds +2025-08-07T13:55:29Z INFO 48510 [sg0000/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:55:29Z INFO 48510 [sg0000/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [sg0000/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.070 seconds +2025-08-07T13:55:29Z INFO 48510 [sg0000/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:55:29Z INFO 48510 [sg0000/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:55:29Z INFO 48510 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 2.705ms (594.000MiB, est bw: 230.258GB/s, 7.675% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (594, 128, 4096) %'36626.52149'[i4422_0,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (75968, 4096) %'input473'[128i4422_0+i0.128,i1.4096] # id=52148, src_id=None, , instances=594 # dl = tensor_op_name: input473_pftranspose_36626 | hlo_id: 20004 | if -128i4422_0-i0.128+75967 >= 0 [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-08-07T13:55:29Z INFO 48510 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.657% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input84_local_38775'[i148_0,i147_0_0_38779,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 2, 128, 3072) %'input84'[i148_0,i147_0_0_38779,i0.128,i1.3072] # id=43014, src_id=None, , instances=64 # dl = tensor_op_name: _dot.395 | hlo_id: 15976 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:55:29Z INFO 48510 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.657% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input95_local_38851'[i270_0,i269_0_0_38855,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 2, 128, 3072) %'input95'[i270_0,i269_0_0_38855,i0.128,i1.3072] # id=43190, src_id=None, , instances=64 # dl = tensor_op_name: _dot.727 | hlo_id: 16091 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:55:29Z INFO 48510 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.657% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input106_local_38927'[i392_0,i391_0_0_38931,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 2, 128, 3072) %'input106'[i392_0,i391_0_0_38931,i0.128,i1.3072] # id=43366, src_id=None, , instances=64 # dl = tensor_op_name: _dot.1059 | hlo_id: 16206 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:55:29Z INFO 48510 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.657% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input117_local_39003'[i514_0,i513_0_0_39007,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 2, 128, 3072) %'input117'[i514_0,i513_0_0_39007,i0.128,i1.3072] # id=43542, src_id=None, , instances=64 # dl = tensor_op_name: _dot.1391 | hlo_id: 16321 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:55:29Z INFO 48510 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.657% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input128_local_39079'[i636_0,i635_0_0_39083,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 2, 128, 3072) %'input128'[i636_0,i635_0_0_39083,i0.128,i1.3072] # id=43718, src_id=None, , instances=64 # dl = tensor_op_name: _dot.1723 | hlo_id: 16436 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:55:29Z INFO 48510 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.657% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input139_local_39155'[i758_0,i757_0_0_39159,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 2, 128, 3072) %'input139'[i758_0,i757_0_0_39159,i0.128,i1.3072] # id=43894, src_id=None, , instances=64 # dl = tensor_op_name: _dot.2055 | hlo_id: 16551 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:55:29Z INFO 48510 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.657% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input150_local_39231'[i880_0,i879_0_0_39235,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 2, 128, 3072) %'input150'[i880_0,i879_0_0_39235,i0.128,i1.3072] # id=44070, src_id=None, , instances=64 # dl = tensor_op_name: _dot.2387 | hlo_id: 16666 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:55:29Z INFO 48510 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.657% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input161_local_39307'[i1002_0,i1001_0_0_39311,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 2, 128, 3072) %'input161'[i1002_0,i1001_0_0_39311,i0.128,i1.3072] # id=44246, src_id=None, , instances=64 # dl = tensor_op_name: _dot.2719 | hlo_id: 16781 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:55:29Z INFO 48510 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.657% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input172_local_39383'[i1124_0,i1123_0_0_39387,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 2, 128, 3072) %'input172'[i1124_0,i1123_0_0_39387,i0.128,i1.3072] # id=44422, src_id=None, , instances=64 # dl = tensor_op_name: _dot.3051 | hlo_id: 16896 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:55:29Z INFO 48510 [sg0000/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [sg0000/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.092 seconds +2025-08-07T13:55:29Z INFO 48510 [sg0000/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.001 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.004 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.006 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (1, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (1, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.003 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (1, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (1, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:55:29Z INFO 48510 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-08-07T13:55:29Z INFO 48510 [sg0000/Tensorizer/OptimizeNKIKernels]: Finished (changed=True) +2025-08-07T13:55:29Z INFO 48510 [sg0000/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.406 seconds +2025-08-07T13:55:29Z INFO 48510 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:55:30Z INFO 48510 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-08-07T13:55:30Z INFO 48510 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.483 seconds +2025-08-07T13:55:30Z INFO 48510 [sg0000/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-08-07T13:55:30Z WARNING 48510 [sg0000/Tensorizer/StaticProfiler]: matmul-based transposes inserted by penguin takes up 91.40 percent of all matmul computation +2025-08-07T13:55:30Z INFO 48510 [sg0000/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48510 [sg0000/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.143 seconds +2025-08-07T13:55:30Z INFO 48510 [sg0000/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-08-07T13:55:30Z INFO 48510 [sg0000/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-08-07T13:55:30Z INFO 48510 [sg0000/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.369 seconds +2025-08-07T13:55:30Z INFO 48510 [sg0000/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-08-07T13:55:30Z INFO 48510 [sg0000/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48510 [sg0000/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.104 seconds +2025-08-07T13:55:30Z INFO 48510 [sg0000/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-08-07T13:55:31Z INFO 48510 [sg0000/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48510 [sg0000/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.257 seconds +2025-08-07T13:55:31Z INFO 48510 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-08-07T13:55:31Z INFO 48510 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48510 [sg0000/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.002 seconds +2025-08-07T13:55:31Z INFO 48510 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-08-07T13:55:33Z INFO 48510 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-08-07T13:55:33Z INFO 48510 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 2.579 seconds +2025-08-07T13:55:35Z INFO 48510 [Tensorizer]: BirCodeGen estimate #instances=325046 in sg0000 +2025-08-07T13:55:35Z INFO 48510 [Tensorizer]: IR signature: a039d7e42d76d7b80e822dd8e8e2f6399bc300bfd3a44d875bc119722a03f8c9 for nc00/sg0000/TensorizerBIR +2025-08-07T13:55:35Z INFO 48510 [Tensorizer]: Weights total number of bytes: 4952584 +2025-08-07T13:55:35Z INFO 48510 [Tensorizer]: Successfully built model. +2025-08-07T13:55:35Z USER 48510 [root/Tensorizer/Tensorizer]: Tensorizer finished after 103.194 seconds +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: End tensorization +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input0 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input1 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input2 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input3 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input4 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input5 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input6 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input7 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input8 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input9 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input10 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input11 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input12 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input13 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input14 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input15 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input16 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input17 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input18 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input19 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input20 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input21 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input22 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input23 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input24 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input25 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input26 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input27 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input28 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input29 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input30 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input31 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input32 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input33 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input34 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input35 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input36 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input37 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input38 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input39 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input40 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input41 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input42 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input43 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input44 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input45 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input46 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input47 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input48 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input49 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input50 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input51 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input52 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input53 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input54 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input55 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input56 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input57 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input58 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input59 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input60 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input61 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input62 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input63 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input64 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input65 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input66 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input67 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input68 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input69 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input70 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input71 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input72 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input73 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input74 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input75 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input76 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input77 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input78 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input79 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input80 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input81 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input82 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input83 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input84 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input85 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input86 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input87 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input88 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input89 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input90 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input91 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input92 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input93 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input94 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input95 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input96 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input97 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input98 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input99 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input100 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input101 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input102 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input103 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input104 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input105 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input106 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input107 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input108 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input109 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input110 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input111 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input112 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input113 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input114 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input115 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input116 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input117 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input118 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input119 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input120 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input121 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input122 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input123 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input124 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input125 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input126 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input127 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input128 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input129 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input130 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input131 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input132 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input133 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input134 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input135 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input136 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input137 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input138 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input139 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input140 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input141 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input142 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input143 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input144 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input145 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input146 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input147 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input148 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input149 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input150 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input151 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input152 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input153 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input154 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input155 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input156 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input157 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input158 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input159 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input160 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input161 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input162 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input163 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input164 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input165 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input166 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input167 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input168 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input169 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input170 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input171 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input172 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input173 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input174 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input175 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input176 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input177 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input178 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input179 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input180 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input181 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input182 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input183 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input184 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input185 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input186 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input187 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input188 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input189 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input190 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input191 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input192 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input193 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input194 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input195 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input196 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input197 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input198 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input199 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input200 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input201 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input202 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input203 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input204 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input205 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input206 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input207 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input208 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input209 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input210 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input211 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input212 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input213 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input214 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input215 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input216 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input217 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input218 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input219 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input220 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input221 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input222 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input223 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input224 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input225 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input226 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input227 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input228 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input229 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input230 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input231 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input232 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input233 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input234 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input235 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input236 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input237 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input238 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input239 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input240 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input241 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input242 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input243 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input244 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input245 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input246 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input247 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input248 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input249 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input250 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input251 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input252 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input253 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input254 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input255 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input256 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input257 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input258 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input259 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input260 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input261 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input262 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input263 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input264 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input265 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input266 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input267 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input268 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input269 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input270 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input271 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input272 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input273 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input274 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input275 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input276 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input277 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input278 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input279 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input280 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input281 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input282 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input283 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input284 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input285 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input286 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input287 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input288 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input289 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input290 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input291 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input292 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input293 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input294 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input295 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input296 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input297 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input298 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input299 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input300 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input301 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input302 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input303 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input304 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input305 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input306 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input307 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input308 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input309 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input310 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input311 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input312 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input313 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input314 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input315 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input316 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input317 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input318 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input319 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input320 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input321 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input322 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input323 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input324 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input325 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input326 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input327 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input328 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input329 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input330 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input331 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input332 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input333 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input334 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input335 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input336 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input337 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input338 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input339 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input340 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input341 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input342 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input343 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input344 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input345 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input346 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input347 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input348 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input349 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input350 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input351 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input352 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input353 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input354 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input355 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input356 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input357 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input358 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input359 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input360 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input361 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input362 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input363 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input364 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input365 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input366 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input367 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input368 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input369 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input370 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input371 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input372 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input373 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input374 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input375 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input376 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input377 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input378 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input379 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input380 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input381 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input382 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input383 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input384 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input385 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input386 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input387 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input388 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input389 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input390 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input391 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input392 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input393 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input394 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input395 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input396 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input397 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input398 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input399 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input400 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input401 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input402 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input403 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input404 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input405 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input406 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input407 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input408 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input409 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input410 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input411 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input412 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input413 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input414 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input415 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input416 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input417 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input418 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input419 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input420 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input421 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input422 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input423 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input424 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input425 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input426 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input427 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input428 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input429 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input430 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input431 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input432 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input433 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input434 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input435 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input436 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input437 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input438 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input439 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input440 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input441 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input442 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input443 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input444 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input445 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input446 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input447 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input448 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input449 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input450 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input451 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input452 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input453 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input454 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input455 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input456 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input457 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input458 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input459 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input460 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input461 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input462 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input463 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input464 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input465 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input466 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input467 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input468 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input469 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input470 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input471 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input472 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input473 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Network input: input474 +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: wrote bir.json +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: wrote tensor_map.json +2025-08-07T13:55:35Z INFO 48510 [job.Frontend.0]: Job #0 finished +2025-08-07T13:55:35Z INFO 48510 [pipeline.Pipeline.0]: Finished job job.Frontend.0 +2025-08-07T13:55:35Z INFO 48510 [pipeline.Pipeline.0]: Starting job job.StaticIOTranspose.0 +2025-08-07T13:55:35Z INFO 48510 [pipeline.Pipeline.0]: Finished job job.StaticIOTranspose.0 +2025-08-07T13:55:35Z INFO 48510 [pipeline.Pipeline.0]: Starting job job.WalrusDriver.0 +2025-08-07T13:55:35Z INFO 48510 [job.WalrusDriver.0]: BackendDriver has 1 states with 1 core LNC +2025-08-07T13:55:35Z INFO 48510 [job.WalrusDriver.0]: BackendDriver: no partitions found. Switching to flat flow. +2025-08-07T13:55:35Z INFO 48510 [job.WalrusDriver.0]: Job WalrusDriver len(in_states) 1 +2025-08-07T13:55:35Z INFO 48510 [job.WalrusDriver.0]: Processing input #0 +2025-08-07T13:55:35Z INFO 48510 [job.WalrusDriver.0]: BackendDriver in_state.num_states 1 with 1 core LNC +2025-08-07T13:55:35Z INFO 48510 [job.WalrusDriver.0]: Executing /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/walrus_driver --optlevel 2 --allocator coloring --verbose 35 --logfile-verbose 20 --logfile /home/ubuntu/qwen3/token_generation_model/_tp0_bk2/log-neuron-cc.txt --execute-repetition 1 -i bir.json --min_split_size 10240 --skip_split_vns '' --no_split_dram --split_huge_dram_tensor 1.0 --preprocessing_only --max_tensorizer_distance 64 --pack_same_shape_only --instruction_fetch_latency 511 --max-partitions 1 --policy 3 --auxflag 0 --interleave none --schedule-delayed-latency 1 --postsched-mm-accum-reorder=false --max-load-color-rotation --max-load-lower-bound 0.14 --mm-reorder-opt --force-prefetch-follow-incoming-order -1 --allreduce-buffer-size 500 --dram-page-size 512 --dram-rotation-size -1 --allreduce-rotation-dis 8 --repeat-load-thres 4 --enable-mm-transpose-remat-optimization=true --save-len-thres 512 --save-dma-cnt-thres 32 --relaxed-order=true --enable-anti-dependence-reduction=false --num-semaphores-per-queue 16 --numcores 1 --act-root-json /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/pwp/pwp_bin_trainium/act_info.json --dve-root-json /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen2/dve_info.json --unified-backend-and-legacy-codegen --tensor-map tensor_map.json --enable-verifier=true --enable-birsim=false --enable-birsim-sync-only=false --enable-data-race-checker=false --enable-new-backend=true --inject-error=NONE --dge-levels io,scalar_dynamic_offset,vector_dynamic_offsets --dynamic-dma-scratch-size-per-partition=16384 --neff-output-filename /home/ubuntu/qwen3/token_generation_model/_tp0_bk2/model.MODULE_0ae1021f5dbf9cbac54d+2aa9c8c9.neff +2025-08-07T13:55:35Z INFO 48510 [job.WalrusDriver.0]: Working directory is /home/ubuntu/qwen3/token_generation_model/_tp0_bk2/neuronxcc-gh4n9bnf/sg00 +2025-08-07T13:55:35Z INFO 48510 [job.WalrusDriver.0]: propagate_exit=True +2025-08-07T13:55:35Z INFO 48510 [job.WalrusDriver.0]: use_logger=False +2025-08-07T13:55:35Z INFO 48510 [job.WalrusDriver.0]: expose_stderr=True +2025-08-07T13:55:35Z INFO 49655 [Logging]: Logging to ../../log-neuron-cc.txt at level 'INFO' +2025-08-07T13:55:35Z INFO 49655 [BackendDriver]: max_allowed_parallelism=128 +2025-08-07T13:55:36Z INFO 49655 [BackendDriver]: Backend driver mtBackend: false numModules: 1 Cwd: "/home/ubuntu/qwen3/token_generation_model/_tp0_bk2/neuronxcc-gh4n9bnf/sg00" +2025-08-07T13:55:36Z INFO 49655 [BackendDriver]: DynamicDMA is enabled +2025-08-07T13:55:36Z INFO 49655 [BackendDriver]: DynamicDMA levels being enabled: io, scalar_dynamic_offset, vector_dynamic_offsets, +2025-08-07T13:55:36Z USER 49655 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:55:36Z INFO 49655 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=7374 blocks=1 instructions=7478 Max writers: 191 Max Readers: 475 +2025-08-07T13:55:36Z USER 49655 [ModuleForkPass]: Running do_nothing +2025-08-07T13:55:36Z INFO 49655 [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=7374 blocks=1 instructions=7478 Max writers: 191 Max Readers: 475 +2025-08-07T13:55:36Z USER 49655 [ModuleForkPass]: do_nothing finished after 0.001 seconds +2025-08-07T13:55:36Z INFO 49655 [ModuleForkPass]: curr_vmrss: 218mb, ru_maxrss: 700mb (delta=0mb) +2025-08-07T13:55:36Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 7374 memory location(s), 1 block(s), and 7478 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:55:36Z USER 49655 [ModuleForkPass]: Running birverifier +2025-08-07T13:55:36Z INFO 49655 [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=7374 blocks=1 instructions=7478 Max writers: 191 Max Readers: 475 +2025-08-07T13:55:36Z WARNING 49655 [birverifier::InstVisitor]: (module) Non - output memory location with no reader: {convert.345.62545}@SB<0,0>(1x2)#Internal DebugInfo: +2025-08-07T13:55:36Z USER 49655 [ModuleForkPass]: birverifier finished after 0.235 seconds +2025-08-07T13:55:36Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1013mb, ru_maxrss: 1013mb (delta=313mb) +2025-08-07T13:55:36Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 7374 memory location(s), 1 block(s), and 7478 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:55:36Z USER 49655 [BackendPassManager]: mod_parallel_pass finished after 0.241 seconds +2025-08-07T13:55:36Z INFO 49655 [BackendPassManager]: curr_vmrss: 1005mb, ru_maxrss: 1013mb (delta=313mb) +2025-08-07T13:55:36Z INFO 49655 [BackendPassManager]: Output has 1 module(s), 1 function(s), 7374 memory location(s), 1 block(s), and 7478 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:55:36Z USER 49655 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:55:36Z INFO 49655 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=1 allocs=7374 blocks=1 instructions=7478 Max writers: 191 Max Readers: 475 +2025-08-07T13:55:36Z USER 49655 [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:55:36Z INFO 49655 [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=7374 blocks=1 instructions=7478 Max writers: 191 Max Readers: 475 +2025-08-07T13:55:36Z USER 49655 [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-08-07T13:55:36Z INFO 49655 [SubgraphForkPass]: curr_vmrss: 1005mb, ru_maxrss: 1013mb (delta=0mb) +2025-08-07T13:55:36Z INFO 49655 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 7374 memory location(s), 1 block(s), and 7478 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:55:36Z USER 49655 [BackendPassManager]: subgraph_parallel_pass finished after 0.002 seconds +2025-08-07T13:55:36Z INFO 49655 [BackendPassManager]: curr_vmrss: 1005mb, ru_maxrss: 1013mb (delta=0mb) +2025-08-07T13:55:36Z INFO 49655 [BackendPassManager]: Output has 1 module(s), 1 function(s), 7374 memory location(s), 1 block(s), and 7478 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:55:36Z USER 49655 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:55:36Z INFO 49655 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=7374 blocks=1 instructions=7478 Max writers: 191 Max Readers: 475 +2025-08-07T13:55:36Z USER 49655 [ModuleForkPass]: Running expand_replication +2025-08-07T13:55:36Z INFO 49655 [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=7374 blocks=1 instructions=7478 Max writers: 191 Max Readers: 475 +2025-08-07T13:55:36Z INFO 49655 [ExpandReplication]: Found 0 replicated matmults +2025-08-07T13:55:36Z USER 49655 [ModuleForkPass]: expand_replication finished after 0.001 seconds +2025-08-07T13:55:36Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1005mb, ru_maxrss: 1013mb (delta=0mb) +2025-08-07T13:55:36Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 7374 memory location(s), 1 block(s), and 7478 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:55:36Z USER 49655 [ModuleForkPass]: Running unroll +2025-08-07T13:55:36Z INFO 49655 [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=7374 blocks=1 instructions=7478 Max writers: 191 Max Readers: 475 +2025-08-07T13:55:36Z INFO 49655 [Unroll]: INFO (Unroll) Start unrolling at Thu Aug 7 13:55:36 2025 +2025-08-07T13:55:39Z INFO 49655 [Unroll]: INFO (Unroll) DONE unrolling Thu Aug 7 13:55:36 2025 + +2025-08-07T13:55:39Z INFO 49655 [Unroll]: sg0000 Instruction count after Unroll: +2025-08-07T13:55:39Z INFO 49655 [Unroll]: Total count: 279229 +2025-08-07T13:55:39Z INFO 49655 [Unroll]: Matmult: 253176 +2025-08-07T13:55:39Z INFO 49655 [Unroll]: GenericCopy: 11675 +2025-08-07T13:55:39Z INFO 49655 [Unroll]: Load: 8476 +2025-08-07T13:55:39Z INFO 49655 [Unroll]: TensorTensor: 1341 +2025-08-07T13:55:39Z INFO 49655 [Unroll]: TensorScalarPtr: 1338 +2025-08-07T13:55:39Z INFO 49655 [Unroll]: Save: 682 +2025-08-07T13:55:39Z INFO 49655 [Unroll]: Activation: 545 +2025-08-07T13:55:39Z INFO 49655 [Unroll]: StreamShuffle: 510 +2025-08-07T13:55:39Z INFO 49655 [Unroll]: Memset: 300 +2025-08-07T13:55:39Z INFO 49655 [Unroll]: Max: 224 +2025-08-07T13:55:39Z INFO 49655 [Unroll]: MaxIndex: 224 +2025-08-07T13:55:39Z INFO 49655 [Unroll]: MatchReplace: 217 +2025-08-07T13:55:39Z INFO 49655 [Unroll]: TensorReduce: 115 +2025-08-07T13:55:39Z INFO 49655 [Unroll]: CollectiveCompute: 75 +2025-08-07T13:55:39Z INFO 49655 [Unroll]: Reciprocal: 75 +2025-08-07T13:55:39Z INFO 49655 [Unroll]: DMACopy: 74 +2025-08-07T13:55:39Z INFO 49655 [Unroll]: Iota: 73 +2025-08-07T13:55:39Z INFO 49655 [Unroll]: Select: 38 +2025-08-07T13:55:39Z INFO 49655 [Unroll]: StreamTranspose: 36 +2025-08-07T13:55:39Z INFO 49655 [Unroll]: Gather: 35 +2025-08-07T13:55:39Z INFO 49655 [Unroll]: Unrolled DGE count with Dynamic AP: 73 +2025-08-07T13:55:39Z USER 49655 [ModuleForkPass]: unroll finished after 2.671 seconds +2025-08-07T13:55:39Z INFO 49655 [ModuleForkPass]: curr_vmrss: 2402mb, ru_maxrss: 2402mb (delta=1389mb) +2025-08-07T13:55:39Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28961 memory location(s), 1 block(s), and 279229 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:39Z USER 49655 [BackendPassManager]: mod_parallel_pass finished after 2.731 seconds +2025-08-07T13:55:39Z INFO 49655 [BackendPassManager]: curr_vmrss: 1495mb, ru_maxrss: 2402mb (delta=1389mb) +2025-08-07T13:55:39Z INFO 49655 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28961 memory location(s), 1 block(s), and 279229 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:39Z USER 49655 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:55:39Z INFO 49655 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=1 allocs=28961 blocks=1 instructions=279229 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:39Z USER 49655 [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:55:39Z INFO 49655 [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=28961 blocks=1 instructions=279229 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:39Z INFO 49655 [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:55:39Z INFO 49655 [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:55:39Z INFO 49655 [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:55:39Z INFO 49655 [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:55:39Z USER 49655 [SubgraphForkPass]: dead_code_elim finished after 0.297 seconds +2025-08-07T13:55:39Z INFO 49655 [SubgraphForkPass]: curr_vmrss: 1503mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:55:39Z INFO 49655 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 28340 memory location(s), 1 block(s), and 279228 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:39Z USER 49655 [BackendPassManager]: subgraph_parallel_pass finished after 0.301 seconds +2025-08-07T13:55:39Z INFO 49655 [BackendPassManager]: curr_vmrss: 1503mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:55:39Z INFO 49655 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28340 memory location(s), 1 block(s), and 279228 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:39Z USER 49655 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:55:39Z INFO 49655 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=28340 blocks=1 instructions=279228 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:39Z USER 49655 [ModuleForkPass]: Running birverifier +2025-08-07T13:55:39Z INFO 49655 [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=28340 blocks=1 instructions=279228 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:39Z USER 49655 [ModuleForkPass]: birverifier finished after 0.266 seconds +2025-08-07T13:55:39Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1548mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:55:39Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28340 memory location(s), 1 block(s), and 279228 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:39Z USER 49655 [BackendPassManager]: mod_parallel_pass finished after 0.271 seconds +2025-08-07T13:55:39Z INFO 49655 [BackendPassManager]: curr_vmrss: 1548mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:55:39Z INFO 49655 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28340 memory location(s), 1 block(s), and 279228 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:39Z USER 49655 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:55:39Z INFO 49655 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=1 allocs=28340 blocks=1 instructions=279228 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:39Z USER 49655 [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:55:39Z INFO 49655 [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=28340 blocks=1 instructions=279228 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:39Z USER 49655 [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-08-07T13:55:39Z INFO 49655 [SubgraphForkPass]: curr_vmrss: 1548mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:55:39Z INFO 49655 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 28340 memory location(s), 1 block(s), and 279228 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:39Z USER 49655 [BackendPassManager]: subgraph_parallel_pass finished after 0.004 seconds +2025-08-07T13:55:39Z INFO 49655 [BackendPassManager]: curr_vmrss: 1548mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:55:39Z INFO 49655 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28340 memory location(s), 1 block(s), and 279228 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:39Z USER 49655 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:55:39Z INFO 49655 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=28340 blocks=1 instructions=279228 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:39Z USER 49655 [ModuleForkPass]: Running instruction_reorder +2025-08-07T13:55:39Z INFO 49655 [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=28340 blocks=1 instructions=279228 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:39Z USER 49655 [ModuleForkPass]: instruction_reorder finished after 0.050 seconds +2025-08-07T13:55:39Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1548mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:55:39Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28340 memory location(s), 1 block(s), and 279228 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:39Z USER 49655 [ModuleForkPass]: Running psum_legalization +2025-08-07T13:55:39Z INFO 49655 [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=28340 blocks=1 instructions=279228 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:39Z USER 49655 [ModuleForkPass]: psum_legalization finished after 0.035 seconds +2025-08-07T13:55:39Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1548mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:55:39Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28340 memory location(s), 1 block(s), and 279228 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:39Z USER 49655 [ModuleForkPass]: Running legalize_cce_dma +2025-08-07T13:55:39Z INFO 49655 [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=28340 blocks=1 instructions=279228 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:39Z USER 49655 [ModuleForkPass]: legalize_cce_dma finished after 0.030 seconds +2025-08-07T13:55:39Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1548mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:55:39Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28340 memory location(s), 1 block(s), and 279228 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:39Z USER 49655 [ModuleForkPass]: Running error_injector +2025-08-07T13:55:39Z INFO 49655 [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=28340 blocks=1 instructions=279228 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:39Z WARNING 49655 [ErrorInjector]: Unrecognized injected error value "0" +2025-08-07T13:55:39Z USER 49655 [ModuleForkPass]: error_injector finished after 0.002 seconds +2025-08-07T13:55:39Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1548mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:55:39Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28340 memory location(s), 1 block(s), and 279228 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:39Z USER 49655 [ModuleForkPass]: Running vn_splitter +2025-08-07T13:55:39Z INFO 49655 [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=28340 blocks=1 instructions=279228 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:39Z INFO 49655 [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 14 +2025-08-07T13:55:39Z INFO 49655 [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-08-07T13:55:40Z INFO 49655 [ShrinkDN]: INFO (ShrinkDN): Shrunk 3 nodes. Total savings 14460 bytes/partition +2025-08-07T13:55:40Z INFO 49655 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-08-07T13:55:40Z INFO 49655 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-08-07T13:55:40Z INFO 49655 [VNSplitterPass]: INFO (VNSplitter) Time: 0.002 seconds +2025-08-07T13:55:40Z INFO 49655 [VNSplitterPass]: INFO (VerticalFusion) Time: 0.042 seconds +2025-08-07T13:55:40Z INFO 49655 [VNSplitterPass]: INFO (ShrinkDN) Time: 0.045 seconds +2025-08-07T13:55:40Z USER 49655 [ModuleForkPass]: vn_splitter finished after 0.138 seconds +2025-08-07T13:55:40Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1552mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:55:40Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28340 memory location(s), 1 block(s), and 279228 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:40Z USER 49655 [ModuleForkPass]: Running constant_propagate +2025-08-07T13:55:40Z INFO 49655 [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=28340 blocks=1 instructions=279228 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:40Z INFO 49655 [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-08-07T13:55:40Z INFO 49655 [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:55:40Z INFO 49655 [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:55:40Z INFO 49655 [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:55:40Z INFO 49655 [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:55:40Z INFO 49655 [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-08-07T13:55:40Z INFO 49655 [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:55:40Z INFO 49655 [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:55:40Z INFO 49655 [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:55:40Z INFO 49655 [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:55:40Z USER 49655 [ModuleForkPass]: constant_propagate finished after 0.628 seconds +2025-08-07T13:55:40Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1555mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:55:40Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28340 memory location(s), 1 block(s), and 279228 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:40Z USER 49655 [ModuleForkPass]: Running lower_ac +2025-08-07T13:55:40Z INFO 49655 [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=28340 blocks=1 instructions=279228 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:40Z INFO 49655 [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-08-07T13:55:40Z USER 49655 [ModuleForkPass]: lower_ac finished after 0.043 seconds +2025-08-07T13:55:40Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1555mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:55:40Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28340 memory location(s), 1 block(s), and 279228 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:40Z USER 49655 [ModuleForkPass]: Running input_dma_coalescing +2025-08-07T13:55:40Z INFO 49655 [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=28340 blocks=1 instructions=279228 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:40Z INFO 49655 [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-08-07T13:55:40Z USER 49655 [ModuleForkPass]: input_dma_coalescing finished after 0.087 seconds +2025-08-07T13:55:40Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1555mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:55:40Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28340 memory location(s), 1 block(s), and 279228 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:40Z USER 49655 [ModuleForkPass]: Running remat_optimization +2025-08-07T13:55:40Z INFO 49655 [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=28340 blocks=1 instructions=279228 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:41Z INFO 49655 [RematOpt]: Removed 0 remat instructions +2025-08-07T13:55:41Z USER 49655 [ModuleForkPass]: remat_optimization finished after 0.152 seconds +2025-08-07T13:55:41Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1557mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:55:41Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28340 memory location(s), 1 block(s), and 279228 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:41Z USER 49655 [ModuleForkPass]: Running early_peephole_opts +2025-08-07T13:55:41Z INFO 49655 [ModuleForkPass]: Inputs to early_peephole_opts: modules=1 functions=1 allocs=28340 blocks=1 instructions=279228 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:41Z INFO 49655 [EarlyPeepholeOpts]: PeepholeOpts enabled? ActivationAccumulate: true +2025-08-07T13:55:41Z INFO 49655 [EarlyPeepholeOpts]: Activation Accumulate: 0 +2025-08-07T13:55:41Z USER 49655 [ModuleForkPass]: early_peephole_opts finished after 0.090 seconds +2025-08-07T13:55:41Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1557mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:55:41Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28340 memory location(s), 1 block(s), and 279228 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:41Z USER 49655 [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-08-07T13:55:41Z INFO 49655 [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=28340 blocks=1 instructions=279228 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:41Z USER 49655 [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.024 seconds +2025-08-07T13:55:41Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1557mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:55:41Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28340 memory location(s), 1 block(s), and 279228 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:41Z USER 49655 [ModuleForkPass]: Running infer_stream_ids +2025-08-07T13:55:41Z INFO 49655 [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=28340 blocks=1 instructions=279228 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:41Z USER 49655 [ModuleForkPass]: infer_stream_ids finished after 0.022 seconds +2025-08-07T13:55:41Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1557mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:55:41Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28340 memory location(s), 1 block(s), and 279228 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:41Z USER 49655 [ModuleForkPass]: Running pre_sched +2025-08-07T13:55:41Z INFO 49655 [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=28340 blocks=1 instructions=279228 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:41Z INFO 49655 [PreSched]: Start PRE scheduling 2 cores: 1 at: Thu Aug 7 13:55:41 2025 +2025-08-07T13:55:41Z INFO 49655 [LayerSpiller]: LayerSpill: Start... +2025-08-07T13:55:41Z INFO 49655 [LayerSpiller]: LayerSpill: Found 72 Splits CCs +2025-08-07T13:55:41Z INFO 49655 [LayerSpiller]: Grouped CCs to 72 clusters. +2025-08-07T13:55:41Z INFO 49655 [LayerSpiller]: LayerSpill: To Spill 60 multi-layer tensors +2025-08-07T13:55:41Z INFO 49655 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-08-07T13:55:41Z INFO 49655 [LayerSpiller]: LayerSpill: Done. +2025-08-07T13:55:41Z INFO 49655 [PreSched]: Start split live ranges Thu Aug 7 13:55:41 2025 +2025-08-07T13:55:41Z INFO 49655 [PreSched]: Num_Splits: 0 +2025-08-07T13:55:41Z INFO 49655 [PreSched]: End split live ranges Thu Aug 7 13:55:41 2025 +2025-08-07T13:55:41Z INFO 49655 [PreSched]: Strt remove redundncies Thu Aug 7 13:55:41 2025 +2025-08-07T13:55:41Z INFO 49655 [PreSched]: remove_redundant_memsets +2025-08-07T13:55:41Z INFO 49655 [PreSched]: remove_redundant_memsets: 0 +2025-08-07T13:55:41Z INFO 49655 [PreSched]: remove_redundant_loads +2025-08-07T13:55:41Z INFO 49655 [PreSched]: remove_redundant_loads: 0 +2025-08-07T13:55:41Z INFO 49655 [PreSched]: End remove redundncies Thu Aug 7 13:55:41 2025 +2025-08-07T13:55:41Z INFO 49655 [PreSched]: Start DCE Thu Aug 7 13:55:41 2025 +2025-08-07T13:55:41Z INFO 49655 [PreSched]: eliminateDeadStore removed 0 instructions +2025-08-07T13:55:41Z INFO 49655 [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:55:41Z INFO 49655 [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:55:41Z INFO 49655 [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:55:41Z INFO 49655 [PreSched]: End DCE Thu Aug 7 13:55:41 2025 +2025-08-07T13:55:41Z INFO 49655 [PreSched]: Start build flow dependencies Thu Aug 7 13:55:41 2025 +2025-08-07T13:55:41Z INFO 49655 [build_flow_deps]: Start build fdeps. Invocation: 1Thu Aug 7 13:55:41 2025 +2025-08-07T13:55:41Z INFO 49655 [build_flow_deps]: Allocs: 28460 instructions: 279348 +2025-08-07T13:55:42Z INFO 49655 [build_flow_deps]: Build fdeps inserted 821895 edges +2025-08-07T13:55:42Z INFO 49655 [build_flow_deps]: Done build fdeps 821895 Thu Aug 7 13:55:42 2025 +2025-08-07T13:55:42Z INFO 49655 [PreSched]: End build flow dependencies Thu Aug 7 13:55:42 2025 +2025-08-07T13:55:42Z INFO 49655 [PreSched]: Start remove useless insts Thu Aug 7 13:55:42 2025 +2025-08-07T13:55:42Z INFO 49655 [PreSched]: remove_useless_insts +2025-08-07T13:55:42Z INFO 49655 [PreSched]: remove Useless Instructions: 0 +2025-08-07T13:55:42Z INFO 49655 [PreSched]: End remove useless insts Thu Aug 7 13:55:42 2025 +2025-08-07T13:55:42Z INFO 49655 [PreSched]: Start scratchpad optimization Thu Aug 7 13:55:42 2025 +2025-08-07T13:55:42Z INFO 49655 [PreSched]: End scratchpad optimization Thu Aug 7 13:55:42 2025 +2025-08-07T13:55:42Z INFO 49655 [PreSched]: DONE PRE scheduling Thu Aug 7 13:55:42 2025 +2025-08-07T13:55:42Z USER 49655 [ModuleForkPass]: pre_sched finished after 1.821 seconds +2025-08-07T13:55:42Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1680mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:55:42Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28460 memory location(s), 1 block(s), and 279348 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:42Z USER 49655 [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:55:42Z INFO 49655 [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=28460 blocks=1 instructions=279348 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:43Z INFO 49655 [TensorCopyElim]: Tensor CP elimination: 1 +2025-08-07T13:55:43Z INFO 49655 [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:55:43Z INFO 49655 [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:55:43Z INFO 49655 [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:55:43Z INFO 49655 [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:55:43Z USER 49655 [ModuleForkPass]: tensor_copy_elim finished after 0.476 seconds +2025-08-07T13:55:43Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1680mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:55:43Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28459 memory location(s), 1 block(s), and 279347 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:43Z USER 49655 [ModuleForkPass]: Running dynamic_dma_setup +2025-08-07T13:55:43Z INFO 49655 [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=28459 blocks=1 instructions=279347 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:43Z USER 49655 [ModuleForkPass]: dynamic_dma_setup finished after 0.002 seconds +2025-08-07T13:55:43Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1680mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:55:43Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28460 memory location(s), 1 block(s), and 279347 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:43Z USER 49655 [ModuleForkPass]: Running runtime_memory_reservation +2025-08-07T13:55:43Z INFO 49655 [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=28460 blocks=1 instructions=279347 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:43Z USER 49655 [ModuleForkPass]: runtime_memory_reservation finished after 0.001 seconds +2025-08-07T13:55:43Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1680mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:55:43Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28460 memory location(s), 1 block(s), and 279347 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:43Z USER 49655 [ModuleForkPass]: Running coloring_allocator_psum +2025-08-07T13:55:43Z INFO 49655 [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=28460 blocks=1 instructions=279347 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:43Z INFO 49655 [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:55:43Z INFO 49655 [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:55:43Z INFO 49655 [PSUM_Allocator]: allocating PSUM +2025-08-07T13:55:43Z INFO 49655 [PSUM_Allocator]: main loop +2025-08-07T13:55:43Z INFO 49655 [PSUM_Allocator]: renumber locations +2025-08-07T13:55:43Z INFO 49655 [PSUM_Allocator]: size = 11886 +2025-08-07T13:55:43Z INFO 49655 [PSUM_Allocator]: build_no_bitmap start +2025-08-07T13:55:43Z INFO 49655 [PSUM_Allocator]: 100% PSUM demand before spilling +2025-08-07T13:55:43Z INFO 49655 [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-08-07T13:55:43Z INFO 49655 [PSUM_Allocator]: found 22249 edges +2025-08-07T13:55:43Z INFO 49655 [PSUM_Allocator]: mean: 3.74373 +2025-08-07T13:55:43Z INFO 49655 [PSUM_Allocator]: median: 2.23653 +2025-08-07T13:55:43Z INFO 49655 [PSUM_Allocator]: adjacency vectors require 177992 bytes +2025-08-07T13:55:43Z INFO 49655 [PSUM_Allocator]: build_no_bitmap done +2025-08-07T13:55:43Z INFO 49655 [PSUM_Allocator]: find costs +2025-08-07T13:55:49Z INFO 49655 [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-08-07T13:55:49Z INFO 49655 [PSUM_Allocator]: simplify interference graph +2025-08-07T13:55:49Z INFO 49655 [PSUM_Allocator]: initialize low and high +2025-08-07T13:55:49Z INFO 49655 [PSUM_Allocator]: lo = 11886 +2025-08-07T13:55:49Z INFO 49655 [PSUM_Allocator]: hi = 0 +2025-08-07T13:55:49Z INFO 49655 [PSUM_Allocator]: inf = 0 +2025-08-07T13:55:49Z INFO 49655 [PSUM_Allocator]: total = 11886 +2025-08-07T13:55:49Z INFO 49655 [PSUM_Allocator]: simplify +2025-08-07T13:55:49Z INFO 49655 [PSUM_Allocator]: new candidates = 0 +2025-08-07T13:55:49Z INFO 49655 [PSUM_Allocator]: select ranges +2025-08-07T13:55:49Z INFO 49655 [PSUM_Allocator]: no more spills +2025-08-07T13:55:49Z INFO 49655 [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-08-07T13:55:49Z INFO 49655 [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-08-07T13:55:49Z INFO 49655 [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-08-07T13:55:49Z USER 49655 [ModuleForkPass]: coloring_allocator_psum finished after 5.798 seconds +2025-08-07T13:55:49Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1683mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:55:49Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28460 memory location(s), 1 block(s), and 279347 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:49Z USER 49655 [ModuleForkPass]: Running dma_optimization_psum +2025-08-07T13:55:49Z INFO 49655 [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=28460 blocks=1 instructions=279347 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:49Z INFO 49655 [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-08-07T13:55:49Z INFO 49655 [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-08-07T13:55:49Z USER 49655 [ModuleForkPass]: dma_optimization_psum finished after 0.179 seconds +2025-08-07T13:55:49Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1683mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:55:49Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28460 memory location(s), 1 block(s), and 279347 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:49Z USER 49655 [ModuleForkPass]: Running address_rotation_psum +2025-08-07T13:55:49Z INFO 49655 [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=28460 blocks=1 instructions=279347 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:49Z INFO 49655 [DMAOptimizationBase]: PSUM Rotation rotated 975 PSUM Banks +2025-08-07T13:55:50Z INFO 49655 [DMAOptimizationBase]: PSUM Rotation rotated 139 PSUM Banks +2025-08-07T13:55:50Z INFO 49655 [DMAOptimizationBase]: PSUM Rotation rotated 388 PSUM Banks +2025-08-07T13:55:50Z USER 49655 [ModuleForkPass]: address_rotation_psum finished after 1.128 seconds +2025-08-07T13:55:50Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1692mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:55:50Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28460 memory location(s), 1 block(s), and 279347 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:50Z USER 49655 [ModuleForkPass]: Running coloring_allocator_sb +2025-08-07T13:55:50Z INFO 49655 [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=28460 blocks=1 instructions=279347 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:55:50Z INFO 49655 [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 7611678278 +2025-08-07T13:55:50Z INFO 49655 [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 7149 bytes +2025-08-07T13:55:50Z INFO 49655 [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 2812426 +2025-08-07T13:55:50Z INFO 49655 [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 397 bytes +2025-08-07T13:55:50Z INFO 49655 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 78980 +2025-08-07T13:55:50Z INFO 49655 [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 136 bytes +2025-08-07T13:55:50Z INFO 49655 [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:55:50Z INFO 49655 [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:55:50Z INFO 49655 [SB_Allocator]: allocating SB +2025-08-07T13:55:50Z INFO 49655 [SB_Allocator]: main loop +2025-08-07T13:55:50Z INFO 49655 [SB_Allocator]: renumber locations +2025-08-07T13:55:50Z INFO 49655 [SB_Allocator]: size = 15722 +2025-08-07T13:55:50Z INFO 49655 [SB_Allocator]: find partners +2025-08-07T13:55:50Z INFO 49655 [SB_Allocator]: found 11631 accumulation groups +2025-08-07T13:55:50Z INFO 49655 [SB_Allocator]: largest = _dot.10675-t42594 +2025-08-07T13:55:50Z INFO 49655 [SB_Allocator]: tensors = 49 +2025-08-07T13:55:50Z INFO 49655 [SB_Allocator]: requires 393280 bytes/partition +2025-08-07T13:55:50Z WARNING 49655 [SB_Allocator]: accumulation group is too large for SB +2025-08-07T13:55:50Z INFO 49655 [SB_Allocator]: expanding partners +2025-08-07T13:55:50Z INFO 49655 []: find first defs for local +2025-08-07T13:55:51Z INFO 49655 []: find first defs for global +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: find loads +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: 1 pin count +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: 8449 remat count +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: build interference graph +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: pass 1 int-tree +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: Num intervals 15722 Num locations 15722 +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: IntervalTree Build Done +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: info.neighbors init Done +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: info.neighbors partners Done +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: IntervalTree readback Done +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: edge: 159947 +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: mean: 20.3469 +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: median: 11.1546 +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: find costs +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: simplify interference graph +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: initialize safe & unsafe +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: safe = 15090 +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: unsafe = 351 +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: inf = 280 +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: total = 15721 +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: simplify +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: simplify_step3_sorted2 #Unsafe 121 #Pinned 0 #Safe 0 minCost 0.00148816 maxCost 1.13634 locations 15722 +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: new candidates = 9 +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: select ranges +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: Total: 15721 +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: Spilled: 0.000 (0) +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: Allocated: 1.000 (15721) +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: Rover zone: 0.960 (15094) +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: Pre-rover zone: 0.031 (492) +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: Post-rover zone: 0.008 (131) +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: Slice zone: 0.000 (4) +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: Blocks nothing: 0.033 (525) +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: Blocks medium: 0.003 (42) +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: Visited until medium blocking (mean): 0.606 +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: Visited until medium blocking (median): 0.578 +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: Visited until medium blocking (p95): 0.912 +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: Blocks tall: 0.964 (15154) +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: Visited until tall blocking (mean): 0.896 +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-08-07T13:55:51Z INFO 49655 [SB_Allocator]: Success +2025-08-07T13:56:16Z INFO 49655 [SB_Allocator]: SB spills = 0 tensors +2025-08-07T13:56:16Z INFO 49655 [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:56:16Z INFO 49655 [SB_Allocator]: remats = 0 tensors +2025-08-07T13:56:16Z INFO 49655 [SB_Allocator]: unpinned = 0 tensors +2025-08-07T13:56:16Z INFO 49655 [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:56:16Z INFO 49655 [SB_Allocator]: SB score = 0 +2025-08-07T13:56:16Z INFO 49655 [SB_Allocator]: spilling from SB cost about 0 cycles +2025-08-07T13:56:16Z INFO 49655 [SB_Allocator]: 16384 bytes/partition (100%) successfully pinned +2025-08-07T13:56:16Z INFO 49655 [SB_Allocator]: pinning saved approximately 9010 cycles +2025-08-07T13:56:16Z INFO 49655 [SB_Allocator]: 0% SB utilization after allocation +2025-08-07T13:56:16Z INFO 49655 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 7611678278 +2025-08-07T13:56:16Z INFO 49655 [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 7149 bytes +2025-08-07T13:56:16Z INFO 49655 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 2812426 +2025-08-07T13:56:16Z INFO 49655 [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 397 bytes +2025-08-07T13:56:16Z INFO 49655 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 78980 +2025-08-07T13:56:16Z INFO 49655 [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 136 bytes +2025-08-07T13:56:16Z USER 49655 [ModuleForkPass]: coloring_allocator_sb finished after 25.747 seconds +2025-08-07T13:56:16Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1700mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:16Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28460 memory location(s), 1 block(s), and 279347 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:16Z USER 49655 [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:56:16Z INFO 49655 [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=28460 blocks=1 instructions=279347 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:16Z INFO 49655 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:56:16Z USER 49655 [ModuleForkPass]: address_rotation_sb finished after 0.371 seconds +2025-08-07T13:56:16Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1703mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:16Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28460 memory location(s), 1 block(s), and 279347 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:16Z USER 49655 [ModuleForkPass]: Running dma_optimization_sb +2025-08-07T13:56:16Z INFO 49655 [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=28460 blocks=1 instructions=279347 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:16Z INFO 49655 [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 7614490704, 99.926% input load, 5.25314e-08% output write, 0.073951% spill/reload [sg0000] +2025-08-07T13:56:16Z INFO 49655 [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:56:16Z INFO 49655 [DMAOptimizationBase]: removed 0 identical load +2025-08-07T13:56:17Z INFO 49655 [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:56:17Z INFO 49655 [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:56:17Z INFO 49655 [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-08-07T13:56:17Z INFO 49655 [DMAOptimizationBase]: [IO to internal DMACopy Insertion]: inserted 0 DMACopy instructions +2025-08-07T13:56:17Z INFO 49655 [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-08-07T13:56:17Z INFO 49655 [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-08-07T13:56:17Z INFO 49655 [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 148, 1.94366e-06% out of total dma traffic(7.60886e+09) +2025-08-07T13:56:17Z INFO 49655 [DMAOptimizationBase]: [spill optimization round 0]: removed 6 spill/reload instructions +2025-08-07T13:56:17Z INFO 49655 [DMAOptimizationBase]: [spill optimization round 0]: removed 6 spill/reload memory locations +2025-08-07T13:56:17Z INFO 49655 [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload instructions +2025-08-07T13:56:17Z INFO 49655 [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload memory locations +2025-08-07T13:56:17Z INFO 49655 [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 4100, 0.0728113% out of total spill/reload dma traffic +2025-08-07T13:56:17Z INFO 49655 [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-08-07T13:56:17Z INFO 49655 [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-08-07T13:56:17Z INFO 49655 [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:56:17Z INFO 49655 [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-08-07T13:56:17Z INFO 49655 [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-08-07T13:56:17Z INFO 49655 [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:56:17Z INFO 49655 [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-08-07T13:56:17Z INFO 49655 [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-08-07T13:56:17Z INFO 49655 [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-08-07T13:56:18Z INFO 49655 [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-08-07T13:56:18Z INFO 49655 [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 116 SpillSaves and Reloads +2025-08-07T13:56:18Z INFO 49655 [DMAOptimizationBase]: average loaded DMA size 7162 bytes +2025-08-07T13:56:18Z INFO 49655 [DMAOptimizationBase]: average saved DMA size 539 bytes +2025-08-07T13:56:18Z INFO 49655 [DMAOptimizationBase]: DMA SpillSave Coalescing Round 1 combined 0 SpillSaves and Reloads +2025-08-07T13:56:18Z INFO 49655 [DMAOptimizationBase]: average loaded DMA size 7162 bytes +2025-08-07T13:56:18Z INFO 49655 [DMAOptimizationBase]: average saved DMA size 539 bytes +2025-08-07T13:56:18Z INFO 49655 [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 7611676080 +2025-08-07T13:56:18Z INFO 49655 [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 7162 bytes +2025-08-07T13:56:18Z INFO 49655 [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 2810376 +2025-08-07T13:56:18Z INFO 49655 [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 539 bytes +2025-08-07T13:56:18Z INFO 49655 [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:56:18Z INFO 49655 [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:56:18Z INFO 49655 [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 4248, 5.57884e-05% out of total dma traffic +2025-08-07T13:56:18Z INFO 49655 [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 7614486456, 99.9261% input load, 5.25314e-08% output write, 0.0738972% spill/reload [sg0000] +2025-08-07T13:56:18Z INFO 49655 [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 7611676080 +2025-08-07T13:56:18Z INFO 49655 [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 7162 bytes +2025-08-07T13:56:18Z INFO 49655 [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 2810376 +2025-08-07T13:56:18Z INFO 49655 [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 539 bytes +2025-08-07T13:56:18Z INFO 49655 [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 78980 +2025-08-07T13:56:18Z INFO 49655 [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 136 bytes +2025-08-07T13:56:18Z INFO 49655 [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 7125 bytes +2025-08-07T13:56:18Z INFO 49655 [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-08-07T13:56:18Z INFO 49655 [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-08-07T13:56:18Z USER 49655 [ModuleForkPass]: dma_optimization_sb finished after 2.163 seconds +2025-08-07T13:56:18Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1737mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:18Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279246 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:18Z USER 49655 [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:56:18Z INFO 49655 [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=28328 blocks=1 instructions=279246 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:19Z INFO 49655 [DMAOptimizationBase]: SB Rotation rotated 370 Sb address +2025-08-07T13:56:19Z INFO 49655 [DMAOptimizationBase]: SB Rotation rotated 4552 Sb address +2025-08-07T13:56:19Z INFO 49655 [DMAOptimizationBase]: SB Rotation rotated 877 Sb address +2025-08-07T13:56:20Z INFO 49655 [DMAOptimizationBase]: SB Rotation rotated 502 Sb address +2025-08-07T13:56:20Z INFO 49655 [DMAOptimizationBase]: SB Rotation rotated 2195 Sb address +2025-08-07T13:56:20Z INFO 49655 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:56:20Z USER 49655 [ModuleForkPass]: address_rotation_sb finished after 1.909 seconds +2025-08-07T13:56:20Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1737mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:20Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279246 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:20Z USER 49655 [ModuleForkPass]: Running coloring_allocator_dram +2025-08-07T13:56:20Z INFO 49655 [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=28328 blocks=1 instructions=279246 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:20Z INFO 49655 [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:56:20Z INFO 49655 [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:56:20Z INFO 49655 [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-08-07T13:56:20Z INFO 49655 [DRAM_Allocator]: reserved space = 8344451360 bytes +2025-08-07T13:56:20Z INFO 49655 [DRAM_Allocator]: spill space = 3420676 bytes +2025-08-07T13:56:20Z INFO 49655 [DRAM_Allocator]: aligned spill space = 3469312 bytes +2025-08-07T13:56:20Z INFO 49655 [DRAM_Allocator]: dram space = 107374182400 bytes +2025-08-07T13:56:20Z INFO 49655 [DRAM_Allocator]: renumber locations +2025-08-07T13:56:20Z INFO 49655 [DRAM_Allocator]: size = 192 +2025-08-07T13:56:20Z INFO 49655 []: find first defs for local +2025-08-07T13:56:20Z INFO 49655 []: find first defs for global +2025-08-07T13:56:21Z INFO 49655 [DRAM_Allocator]: Num intervals 192 Num locations 192 +2025-08-07T13:56:21Z INFO 49655 [DRAM_Allocator]: IntervalTree Build Done +2025-08-07T13:56:21Z INFO 49655 [DRAM_Allocator]: info.neighbors init Done +2025-08-07T13:56:21Z INFO 49655 [DRAM_Allocator]: IntervalTree readback Done +2025-08-07T13:56:21Z INFO 49655 [DRAM_Allocator]: simplify interference graph +2025-08-07T13:56:21Z INFO 49655 [DRAM_Allocator]: initialize low and high +2025-08-07T13:56:21Z INFO 49655 [DRAM_Allocator]: lo = 192 +2025-08-07T13:56:21Z INFO 49655 [DRAM_Allocator]: hi = 0 +2025-08-07T13:56:21Z INFO 49655 [DRAM_Allocator]: total = 192 +2025-08-07T13:56:21Z INFO 49655 [DRAM_Allocator]: simplify +2025-08-07T13:56:21Z INFO 49655 [DRAM_Allocator]: new candidates = 0 +2025-08-07T13:56:21Z INFO 49655 [DRAM_Allocator]: select ranges +2025-08-07T13:56:21Z INFO 49655 [DRAM_Allocator]: CC buffer size limit 524288000 +2025-08-07T13:56:21Z INFO 49655 [DRAM_Allocator]: allreduce_dram_hwm 1208320 +2025-08-07T13:56:21Z INFO 49655 [DRAM_Allocator]: Real CC buffer size 1208320 +2025-08-07T13:56:21Z INFO 49655 [DRAM_Allocator]: DRAM hwm after allocation: 3117056 +2025-08-07T13:56:21Z INFO 49655 [DRAM_Allocator]: DRAM allocation successful +2025-08-07T13:56:21Z USER 49655 [ModuleForkPass]: coloring_allocator_dram finished after 0.424 seconds +2025-08-07T13:56:21Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1737mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:21Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279246 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:21Z USER 49655 [ModuleForkPass]: Running address_rotation_dram +2025-08-07T13:56:21Z INFO 49655 [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=28328 blocks=1 instructions=279246 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:21Z INFO 49655 [DMAOptimizationBase]: Runtime page size at 512MB +2025-08-07T13:56:21Z INFO 49655 [DMAOptimizationBase]: DRAM hwm before rotation 3117056 +2025-08-07T13:56:21Z INFO 49655 [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-08-07T13:56:21Z INFO 49655 [DMAOptimizationBase]: allreduce hwm 1208320 +2025-08-07T13:56:21Z INFO 49655 [DMAOptimizationBase]: Real CC buffer size 1208320 +2025-08-07T13:56:21Z INFO 49655 [DMAOptimizationBase]: DRAM hwm after rotation 3117056 +2025-08-07T13:56:21Z INFO 49655 [DMAOptimizationBase]: DRAM Rotation rotated 9 Dram address +2025-08-07T13:56:21Z USER 49655 [ModuleForkPass]: address_rotation_dram finished after 0.203 seconds +2025-08-07T13:56:21Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1739mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:21Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279246 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:21Z USER 49655 [ModuleForkPass]: Running tensorcopy_accel +2025-08-07T13:56:21Z INFO 49655 [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=28328 blocks=1 instructions=279246 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:21Z INFO 49655 [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-08-07T13:56:21Z INFO 49655 [TensorCopyAccel::Impl]: Accelerated 72 out of 11973 tensorcopy in Function: sg0000 average acceleration factor: 1 +2025-08-07T13:56:21Z USER 49655 [ModuleForkPass]: tensorcopy_accel finished after 0.027 seconds +2025-08-07T13:56:21Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1739mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:21Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279246 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:21Z USER 49655 [ModuleForkPass]: Running peephole_opts +2025-08-07T13:56:21Z INFO 49655 [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=28328 blocks=1 instructions=279246 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:21Z INFO 49655 [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-08-07T13:56:21Z USER 49655 [ModuleForkPass]: peephole_opts finished after 0.107 seconds +2025-08-07T13:56:21Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1739mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:21Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279284 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:21Z USER 49655 [ModuleForkPass]: Running lower_kernel +2025-08-07T13:56:21Z INFO 49655 [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=28328 blocks=1 instructions=279284 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:21Z INFO 49655 [LowerKernel]: Started running LowerKernel +2025-08-07T13:56:21Z INFO 49655 [LowerKernel]: Start of kernel lowering pass, number of insts: 279284, number of allocs: 28328 +2025-08-07T13:56:21Z INFO 49655 [LowerKernel]: Scan BKs time (s): 0.021228 +2025-08-07T13:56:21Z INFO 49655 [LowerKernel]: Lower BKs time (s): 1e-05 +2025-08-07T13:56:21Z USER 49655 [ModuleForkPass]: lower_kernel finished after 0.025 seconds +2025-08-07T13:56:21Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1739mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:21Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279284 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:21Z USER 49655 [ModuleForkPass]: Running lower_nki_kernel +2025-08-07T13:56:21Z INFO 49655 [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=28328 blocks=1 instructions=279284 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:21Z USER 49655 [ModuleForkPass]: lower_nki_kernel finished after 0.023 seconds +2025-08-07T13:56:21Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1739mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:21Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279284 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:21Z USER 49655 [ModuleForkPass]: Running dynamic_dma_cleanup +2025-08-07T13:56:21Z INFO 49655 [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=28328 blocks=1 instructions=279284 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:21Z USER 49655 [ModuleForkPass]: dynamic_dma_cleanup finished after 0.035 seconds +2025-08-07T13:56:21Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1742mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:21Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279284 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:21Z USER 49655 [ModuleForkPass]: Running birverifier +2025-08-07T13:56:21Z INFO 49655 [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=28328 blocks=1 instructions=279284 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:21Z USER 49655 [ModuleForkPass]: birverifier finished after 0.224 seconds +2025-08-07T13:56:21Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1742mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:21Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279284 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:21Z USER 49655 [ModuleForkPass]: Running dynamic_dma_scan +2025-08-07T13:56:21Z INFO 49655 [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=28328 blocks=1 instructions=279284 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:21Z USER 49655 [ModuleForkPass]: dynamic_dma_scan finished after 0.034 seconds +2025-08-07T13:56:21Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1742mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:21Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279284 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:21Z USER 49655 [ModuleForkPass]: Running build_fdeps +2025-08-07T13:56:21Z INFO 49655 [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=28328 blocks=1 instructions=279284 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:21Z INFO 49655 [build_flow_deps]: Start build fdeps. Invocation: 2Thu Aug 7 13:56:21 2025 +2025-08-07T13:56:21Z INFO 49655 [build_flow_deps]: Allocs: 28328 instructions: 279284 +2025-08-07T13:56:22Z INFO 49655 [build_flow_deps]: Build fdeps inserted 822042 edges +2025-08-07T13:56:22Z INFO 49655 [build_flow_deps]: Done build fdeps 822042 Thu Aug 7 13:56:22 2025 +2025-08-07T13:56:22Z USER 49655 [ModuleForkPass]: build_fdeps finished after 0.637 seconds +2025-08-07T13:56:22Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1756mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:22Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279284 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:22Z USER 49655 [ModuleForkPass]: Running remove_redundancies +2025-08-07T13:56:22Z INFO 49655 [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=28328 blocks=1 instructions=279284 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:22Z INFO 49655 [RemoveRedundancies]: remove_clobbered_writes +2025-08-07T13:56:22Z INFO 49655 [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-08-07T13:56:22Z INFO 49655 [RemoveRedundancies]: remove_useless_insts +2025-08-07T13:56:22Z INFO 49655 [RemoveRedundancies]: remove Useless Instructions: 0 +2025-08-07T13:56:22Z USER 49655 [ModuleForkPass]: remove_redundancies finished after 0.098 seconds +2025-08-07T13:56:22Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1756mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:22Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279284 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:22Z USER 49655 [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:56:22Z INFO 49655 [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=28328 blocks=1 instructions=279284 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:22Z INFO 49655 [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:56:22Z INFO 49655 [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:56:22Z INFO 49655 [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:56:23Z USER 49655 [ModuleForkPass]: anti_dependency_analyzer finished after 1.253 seconds +2025-08-07T13:56:23Z INFO 49655 [ModuleForkPass]: curr_vmrss: 2188mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:23Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279284 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:23Z USER 49655 [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:56:23Z INFO 49655 [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=28328 blocks=1 instructions=279284 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:24Z INFO 49655 [TensorCopyElim]: Tensor CP elimination: 0 +2025-08-07T13:56:24Z INFO 49655 [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:56:24Z USER 49655 [ModuleForkPass]: tensor_copy_elim finished after 0.336 seconds +2025-08-07T13:56:24Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1852mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:24Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279284 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:24Z USER 49655 [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-08-07T13:56:24Z INFO 49655 [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=28328 blocks=1 instructions=279284 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:24Z USER 49655 [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.003 seconds +2025-08-07T13:56:24Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1852mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:24Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279284 instruction(s). Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:24Z USER 49655 [ModuleForkPass]: Running post_sched +2025-08-07T13:56:24Z INFO 49655 [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=28328 blocks=1 instructions=279284 Max writers: 1536 Max Readers: 20539 +2025-08-07T13:56:24Z INFO 49655 [post_scheduler]: Start PosT ScheD 3 sunda Thu Aug 7 13:56:24 2025 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.336-t41529 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.383-t41540 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.668-t41563 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.715-t41574 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.1000-t41597 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.1047-t41608 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.1332-t41631 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.1379-t41642 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.1664-t41665 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.1711-t41676 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.1996-t41699 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.2043-t41710 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.2328-t41733 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.2375-t41744 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.2660-t41767 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.2707-t41778 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.2992-t41801 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.3039-t41812 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.3324-t41835 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.3371-t41846 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.3656-t41869 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.3703-t41880 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.3988-t41903 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.4035-t41914 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.4320-t41937 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.4367-t41948 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.4652-t41971 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.4699-t41982 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.4984-t42005 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.5031-t42016 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.5316-t42039 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.5363-t42050 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.5648-t42073 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.5695-t42084 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.5980-t42107 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.6027-t42118 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.6312-t42141 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.6359-t42152 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.6644-t42175 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.6691-t42186 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.6976-t42209 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.7023-t42220 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.7308-t42243 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.7355-t42254 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.7640-t42277 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.7687-t42288 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.7972-t42311 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.8019-t42322 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.8304-t42345 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.8351-t42356 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.8636-t42379 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.8683-t42390 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.8968-t42413 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.9015-t42424 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.9300-t42447 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.9347-t42458 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.9632-t42481 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.9679-t42492 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.9964-t42515 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.10011-t42526 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.10296-t42549 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.10343-t42560 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.10628-t42583 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.10675-t42594 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.10960-t42617 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.11007-t42628 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.11292-t42651 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.11339-t42662 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.11624-t42685 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.11671-t42696 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.11956-t42719 +2025-08-07T13:56:24Z WARNING 49655 [post_scheduler]: Inserted memset 0 for _dot.12003-t42730 +2025-08-07T13:56:36Z INFO 49655 [post_scheduler]: Time-aware hwm post-sched +2025-08-07T13:56:42Z INFO 49655 [post_scheduler]: Time-aware simulation time: 35008201 +2025-08-07T13:56:43Z INFO 49655 [post_scheduler]: Done PosT ScheD Thu Aug 7 13:56:43 2025 +2025-08-07T13:56:43Z USER 49655 [ModuleForkPass]: post_sched finished after 19.323 seconds +2025-08-07T13:56:43Z INFO 49655 [ModuleForkPass]: curr_vmrss: 2288mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:43Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279356 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:43Z USER 49655 [ModuleForkPass]: Running expand_scheduling_units +2025-08-07T13:56:43Z INFO 49655 [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=28328 blocks=1 instructions=279356 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:43Z USER 49655 [ModuleForkPass]: expand_scheduling_units finished after 0.030 seconds +2025-08-07T13:56:43Z INFO 49655 [ModuleForkPass]: curr_vmrss: 2175mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:43Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279356 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:43Z USER 49655 [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:56:43Z INFO 49655 [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=28328 blocks=1 instructions=279356 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:46Z INFO 49655 [DMAOptimizationBase]: PSUM Rotation rotated 7205 PSUM Banks +2025-08-07T13:56:47Z INFO 49655 [DMAOptimizationBase]: PSUM Rotation rotated 7494 PSUM Banks +2025-08-07T13:56:47Z INFO 49655 [DMAOptimizationBase]: PSUM Rotation rotated 332 PSUM Banks +2025-08-07T13:56:48Z INFO 49655 [DMAOptimizationBase]: SB Rotation rotated 392 Sb address +2025-08-07T13:56:48Z INFO 49655 [DMAOptimizationBase]: SB Rotation rotated 4995 Sb address +2025-08-07T13:56:48Z INFO 49655 [DMAOptimizationBase]: SB Rotation rotated 473 Sb address +2025-08-07T13:56:49Z INFO 49655 [DMAOptimizationBase]: SB Rotation rotated 527 Sb address +2025-08-07T13:56:49Z INFO 49655 [DMAOptimizationBase]: SB Rotation rotated 415 Sb address +2025-08-07T13:56:49Z INFO 49655 [DMAOptimizationBase]: moved 0 MM forward +2025-08-07T13:56:50Z INFO 49655 [DMAOptimizationBase]: SB Rotation rotated 6 Sb address +2025-08-07T13:56:50Z INFO 49655 [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-08-07T13:56:50Z USER 49655 [ModuleForkPass]: address_rotation_sb finished after 6.698 seconds +2025-08-07T13:56:50Z INFO 49655 [ModuleForkPass]: curr_vmrss: 2196mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:50Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279356 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:50Z USER 49655 [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:56:50Z INFO 49655 [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=28328 blocks=1 instructions=279356 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:50Z INFO 49655 [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:56:50Z INFO 49655 [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:56:50Z INFO 49655 [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:56:51Z USER 49655 [ModuleForkPass]: anti_dependency_analyzer finished after 1.204 seconds +2025-08-07T13:56:51Z INFO 49655 [ModuleForkPass]: curr_vmrss: 2319mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:51Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279356 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:51Z USER 49655 [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:56:51Z INFO 49655 [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=28328 blocks=1 instructions=279356 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:51Z INFO 49655 [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:56:51Z INFO 49655 [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-08-07T13:56:51Z INFO 49655 [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:56:51Z USER 49655 [ModuleForkPass]: anti_dependency_analyzer finished after 0.218 seconds +2025-08-07T13:56:51Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1982mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:51Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279356 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:51Z USER 49655 [ModuleForkPass]: Running dep_opt +2025-08-07T13:56:51Z INFO 49655 [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=28328 blocks=1 instructions=279356 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:51Z INFO 49655 [build_flow_deps]: Start build fdeps. Invocation: 3Thu Aug 7 13:56:51 2025 +2025-08-07T13:56:51Z INFO 49655 [build_flow_deps]: Allocs: 28328 instructions: 279356 +2025-08-07T13:56:52Z INFO 49655 [build_flow_deps]: Build fdeps inserted 814908 edges +2025-08-07T13:56:52Z INFO 49655 [build_flow_deps]: Done build fdeps 814908 Thu Aug 7 13:56:52 2025 +2025-08-07T13:56:53Z USER 49655 [ModuleForkPass]: dep_opt finished after 1.352 seconds +2025-08-07T13:56:53Z INFO 49655 [ModuleForkPass]: curr_vmrss: 2019mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:53Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279356 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:53Z USER 49655 [ModuleForkPass]: Running report_stats +2025-08-07T13:56:53Z INFO 49655 [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=28328 blocks=1 instructions=279356 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:53Z INFO 49655 [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 1 │ 622329856 │ +│ DMACopy │ Internal │ 1 │ 24576 │ +│ DMACopy │ Internal -> ExternalOutput │ 72 │ 75497472 │ +│ Load │ Const -> Internal │ 78 │ 2394888 │ +│ Load │ ExternalInput -> Internal │ 8268 │ 7606464672 │ +│ Load │ Internal │ 121 │ 2816520 │ +│ Save │ Internal │ 709 │ 2810372 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 4 │ +└─────────────┴────────────────────────────┴───────┴────────────┘ + +2025-08-07T13:56:53Z INFO 49655 [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 72 │ +│ 4 │ 52 │ +│ 8 │ 2 │ +│ 16 │ 3 │ +│ 64 │ 73 │ +│ 256 │ 147 │ +│ 512 │ 666 │ +│ 1024 │ 360 │ +│ 2048 │ 2 │ +│ 4096 │ 2 │ +│ 6144 │ 2304 │ +│ 8192 │ 5493 │ +│ 60768 │ 1 │ +│ 60776 │ 4 │ +│ 262144 │ 72 │ +└─────────────────────┴───────┘ + +2025-08-07T13:56:53Z INFO 49655 [ReportStats]: MM Stats: #MatMults 253176 #MatMult-Transposes 20543 +2025-08-07T13:56:53Z INFO 49655 [ReportStats]: IO Tensor size combined: 8342040600 +2025-08-07T13:56:53Z INFO 49655 [ReportStats]: IO Tensor Statistics: +┌────────────────────┬───────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼───────────────┼──────────┼──────────────┤ +│ input473 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input76 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input85 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input106 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input96 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input84 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input98 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input109 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input107 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input95 │ ExternalInput │ bfloat16 │ 50331648 │ +└────────────────────┴───────────────┴──────────┴──────────────┘ + +2025-08-07T13:56:53Z INFO 49655 [ReportStats]: Large (Internal) Tensor Statistics: +┌────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────────┼──────────┼──────────┼──────────────┤ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ input83_local_38711_i3 │ Internal │ bfloat16 │ 1048576 │ +│ -t70123 │ Internal │ float32 │ 1048576 │ +│ input83_local_38711_i2 │ Internal │ bfloat16 │ 1048576 │ +│ -t70128 │ Internal │ float32 │ 1048576 │ +│ -t70134 │ Internal │ float32 │ 1048576 │ +│ input83_local_38711_i0 │ Internal │ bfloat16 │ 1048576 │ +│ input83_local_38711_i5 │ Internal │ bfloat16 │ 1048576 │ +│ input83_local_38711_i4 │ Internal │ bfloat16 │ 1048576 │ +│ input83_local_38711_i1 │ Internal │ bfloat16 │ 1048576 │ +└────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-08-07T13:56:53Z USER 49655 [ModuleForkPass]: report_stats finished after 0.072 seconds +2025-08-07T13:56:53Z INFO 49655 [ModuleForkPass]: curr_vmrss: 2019mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:53Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279356 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:53Z USER 49655 [BackendPassManager]: mod_parallel_pass finished after 73.457 seconds +2025-08-07T13:56:53Z INFO 49655 [BackendPassManager]: curr_vmrss: 2019mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:53Z INFO 49655 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279356 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:53Z USER 49655 [BackendPassManager]: Running assign_trigger_engine +2025-08-07T13:56:53Z INFO 49655 [BackendPassManager]: Inputs to assign_trigger_engine: modules=1 functions=1 allocs=28328 blocks=1 instructions=279356 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:53Z INFO 49655 [AssignTriggerEngine]: Assigned trigger engine for 785 DMA instructions. Moved 76 DMA instructions to CC's engines. +2025-08-07T13:56:53Z USER 49655 [BackendPassManager]: assign_trigger_engine finished after 0.166 seconds +2025-08-07T13:56:53Z INFO 49655 [BackendPassManager]: curr_vmrss: 2019mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:53Z INFO 49655 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279356 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:53Z USER 49655 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:56:53Z INFO 49655 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=1 allocs=28328 blocks=1 instructions=279356 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:53Z USER 49655 [SubgraphForkPass]: Running lower_local_collectives +2025-08-07T13:56:53Z INFO 49655 [SubgraphForkPass]: Inputs to lower_local_collectives: modules=1 functions=1 allocs=28328 blocks=1 instructions=279356 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:53Z USER 49655 [SubgraphForkPass]: lower_local_collectives finished after 0.002 seconds +2025-08-07T13:56:53Z INFO 49655 [SubgraphForkPass]: curr_vmrss: 2019mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:53Z INFO 49655 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279356 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:53Z USER 49655 [SubgraphForkPass]: Running extend_shared_lifetimes +2025-08-07T13:56:53Z INFO 49655 [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=1 functions=1 allocs=28328 blocks=1 instructions=279356 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:53Z USER 49655 [SubgraphForkPass]: extend_shared_lifetimes finished after 0.001 seconds +2025-08-07T13:56:53Z INFO 49655 [SubgraphForkPass]: curr_vmrss: 2019mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:53Z INFO 49655 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279356 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:53Z USER 49655 [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:56:53Z INFO 49655 [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=28328 blocks=1 instructions=279356 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:53Z INFO 49655 [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:56:53Z USER 49655 [SubgraphForkPass]: dead_code_elim finished after 0.363 seconds +2025-08-07T13:56:53Z INFO 49655 [SubgraphForkPass]: curr_vmrss: 2020mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:53Z INFO 49655 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279356 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:53Z USER 49655 [BackendPassManager]: subgraph_parallel_pass finished after 0.377 seconds +2025-08-07T13:56:53Z INFO 49655 [BackendPassManager]: curr_vmrss: 2020mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:53Z INFO 49655 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279356 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:53Z USER 49655 [BackendPassManager]: Running assign_hwdge_engine +2025-08-07T13:56:53Z INFO 49655 [BackendPassManager]: Inputs to assign_hwdge_engine: modules=1 functions=1 allocs=28328 blocks=1 instructions=279356 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:53Z USER 49655 [BackendPassManager]: assign_hwdge_engine finished after 0.044 seconds +2025-08-07T13:56:53Z INFO 49655 [BackendPassManager]: curr_vmrss: 2020mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:53Z INFO 49655 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279356 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:53Z USER 49655 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:56:53Z INFO 49655 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=28328 blocks=1 instructions=279356 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:53Z USER 49655 [ModuleForkPass]: Running alloc_queues +2025-08-07T13:56:53Z INFO 49655 [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=28328 blocks=1 instructions=279356 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:53Z INFO 49655 [AllocQueues]: DMACopy transpose will be triggered from multiple engines +2025-08-07T13:56:53Z INFO 49655 [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 41 │ +│ qPoolIO0 │ input │ Pool │ 16 │ 1 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 124 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 109 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 671 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 5 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 8300 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-08-07T13:56:53Z USER 49655 [ModuleForkPass]: alloc_queues finished after 0.036 seconds +2025-08-07T13:56:53Z INFO 49655 [ModuleForkPass]: curr_vmrss: 2020mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:53Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279356 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:53Z USER 49655 [ModuleForkPass]: Running chain_dma_transposes +2025-08-07T13:56:53Z INFO 49655 [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=28328 blocks=1 instructions=279356 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:53Z USER 49655 [ModuleForkPass]: chain_dma_transposes finished after 0.003 seconds +2025-08-07T13:56:53Z INFO 49655 [ModuleForkPass]: curr_vmrss: 2020mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:53Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279356 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:53Z USER 49655 [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-08-07T13:56:53Z INFO 49655 [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=28328 blocks=1 instructions=279356 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:53Z USER 49655 [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.001 seconds +2025-08-07T13:56:53Z INFO 49655 [ModuleForkPass]: curr_vmrss: 2020mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:53Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279356 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:53Z USER 49655 [ModuleForkPass]: Running lower_control +2025-08-07T13:56:53Z INFO 49655 [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=28328 blocks=1 instructions=279356 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:54Z INFO 49655 [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-08-07T13:56:54Z USER 49655 [ModuleForkPass]: lower_control finished after 0.408 seconds +2025-08-07T13:56:54Z INFO 49655 [ModuleForkPass]: curr_vmrss: 2020mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:54Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279356 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:54Z USER 49655 [BackendPassManager]: mod_parallel_pass finished after 0.462 seconds +2025-08-07T13:56:54Z INFO 49655 [BackendPassManager]: curr_vmrss: 2020mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:54Z INFO 49655 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279356 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:54Z USER 49655 [BackendPassManager]: Running nc_parallel_pass +2025-08-07T13:56:54Z INFO 49655 [BackendPassManager]: Inputs to nc_parallel_pass: modules=1 functions=1 allocs=28328 blocks=1 instructions=279356 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:54Z USER 49655 [CoreForkPass]: Running dep_reduction +2025-08-07T13:56:54Z INFO 49655 [CoreForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=28328 blocks=1 instructions=279356 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:54Z INFO 49655 [DepReduction]: Start Dependency Reduction +2025-08-07T13:56:54Z INFO 49655 [DepReduction]: Processing async instrs... +2025-08-07T13:56:54Z INFO 49655 [DepReduction]: Processing secondary edges per engine... +2025-08-07T13:56:54Z INFO 49655 [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 256119 +2025-08-07T13:56:55Z INFO 49655 [DepReduction]: Processing redundant descendants, Done. Num edges removed 265521 +2025-08-07T13:56:55Z INFO 49655 [DepReduction]: Processing async instrs, Done. Num edges removed 265521 +2025-08-07T13:56:58Z INFO 49655 [DepReduction]: Num Async removed: 0 +2025-08-07T13:56:58Z INFO 49655 [DepReduction]: Finished dependency reduction: 1868038 removed, new total 39407 +2025-08-07T13:56:58Z INFO 49655 [DepReduction]: Finished Dependency Reduction +2025-08-07T13:56:58Z USER 49655 [CoreForkPass]: dep_reduction finished after 3.698 seconds +2025-08-07T13:56:58Z INFO 49655 [CoreForkPass]: curr_vmrss: 2248mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:58Z INFO 49655 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279356 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:58Z USER 49655 [CoreForkPass]: Running lower_dynamic_dma +2025-08-07T13:56:58Z INFO 49655 [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=1 allocs=28328 blocks=1 instructions=279356 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:58Z USER 49655 [CoreForkPass]: lower_dynamic_dma finished after 0.160 seconds +2025-08-07T13:56:58Z INFO 49655 [CoreForkPass]: curr_vmrss: 2243mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:58Z INFO 49655 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279356 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:58Z USER 49655 [CoreForkPass]: Running legalize_dynamic_dma +2025-08-07T13:56:58Z INFO 49655 [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=1 allocs=28328 blocks=1 instructions=279356 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:58Z INFO 49655 [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 1 DGE instructions +2025-08-07T13:56:58Z INFO 49655 [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 1 DGE instructions were scanned +2025-08-07T13:56:58Z INFO 49655 [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-08-07T13:56:58Z USER 49655 [CoreForkPass]: legalize_dynamic_dma finished after 0.125 seconds +2025-08-07T13:56:58Z INFO 49655 [CoreForkPass]: curr_vmrss: 2242mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:58Z INFO 49655 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279356 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:58Z USER 49655 [CoreForkPass]: Running lower_dma +2025-08-07T13:56:58Z INFO 49655 [CoreForkPass]: Inputs to lower_dma: modules=1 functions=1 allocs=28328 blocks=1 instructions=279356 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:58Z INFO 49655 [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 8154/8154 (100% DGE) + power-of-2 partition : 8155/8197 (99.4876% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 8155/8197 (99.4876% DGE) + Cast (DGE/DMA) + 128 partition : 72/72 (100% DGE) + power-of-2 partition : 72/72 (100% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 72/72 (100% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 0/9 (0% DGE) + power-of-2 partition : 0/908 (0% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/908 (0% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/0 + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/0 + CopyMode + CCE : 1 + Transpose : 0 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 1/1 (100% DGE) + vector : 72/72 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-08-07T13:56:58Z USER 49655 [CoreForkPass]: lower_dma finished after 0.225 seconds +2025-08-07T13:56:58Z INFO 49655 [CoreForkPass]: curr_vmrss: 2242mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:58Z INFO 49655 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279358 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:58Z USER 49655 [CoreForkPass]: Running coalesce_dma_blocks +2025-08-07T13:56:58Z INFO 49655 [CoreForkPass]: Inputs to coalesce_dma_blocks: modules=1 functions=1 allocs=28328 blocks=1 instructions=279358 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:58Z INFO 49655 [CoalesceDmaBlocks]: Coaleseced 37 DMA triggers +2025-08-07T13:56:58Z USER 49655 [CoreForkPass]: coalesce_dma_blocks finished after 0.124 seconds +2025-08-07T13:56:58Z INFO 49655 [CoreForkPass]: curr_vmrss: 2246mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:58Z INFO 49655 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279321 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:58Z USER 49655 [CoreForkPass]: Running expand_all_engine +2025-08-07T13:56:58Z INFO 49655 [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=1 allocs=28328 blocks=1 instructions=279321 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:58Z USER 49655 [CoreForkPass]: expand_all_engine finished after 0.043 seconds +2025-08-07T13:56:58Z INFO 49655 [CoreForkPass]: curr_vmrss: 2242mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:58Z INFO 49655 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279321 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:58Z USER 49655 [CoreForkPass]: Running alloc_semaphores +2025-08-07T13:56:58Z INFO 49655 [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=1 allocs=28328 blocks=1 instructions=279321 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:59Z USER 49655 [CoreForkPass]: alloc_semaphores finished after 0.379 seconds +2025-08-07T13:56:59Z INFO 49655 [CoreForkPass]: curr_vmrss: 2242mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:59Z INFO 49655 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279321 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:59Z USER 49655 [CoreForkPass]: Running expand_inst_late +2025-08-07T13:56:59Z INFO 49655 [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=1 allocs=28328 blocks=1 instructions=279321 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:59Z USER 49655 [CoreForkPass]: expand_inst_late finished after 0.419 seconds +2025-08-07T13:56:59Z INFO 49655 [CoreForkPass]: curr_vmrss: 2242mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:59Z INFO 49655 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279396 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:59Z USER 49655 [CoreForkPass]: Running seq_inst_opt +2025-08-07T13:56:59Z INFO 49655 [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=1 allocs=28328 blocks=1 instructions=279396 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:59Z INFO 49655 [SeqInstOpt]: Removing 71 unnecessary InstRegisterMove instruction(s) from Block1 +2025-08-07T13:56:59Z USER 49655 [CoreForkPass]: seq_inst_opt finished after 0.033 seconds +2025-08-07T13:56:59Z INFO 49655 [CoreForkPass]: curr_vmrss: 2242mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:59Z INFO 49655 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 279325 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:59Z USER 49655 [CoreForkPass]: Running lower_sync +2025-08-07T13:56:59Z INFO 49655 [CoreForkPass]: Inputs to lower_sync: modules=1 functions=1 allocs=28328 blocks=1 instructions=279325 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:59Z USER 49655 [CoreForkPass]: lower_sync finished after 0.100 seconds +2025-08-07T13:56:59Z INFO 49655 [CoreForkPass]: curr_vmrss: 2249mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:59Z INFO 49655 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 288100 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:59Z USER 49655 [CoreForkPass]: Running lower_act +2025-08-07T13:56:59Z INFO 49655 [CoreForkPass]: Inputs to lower_act: modules=1 functions=1 allocs=28328 blocks=1 instructions=288100 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:59Z USER 49655 [CoreForkPass]: lower_act finished after 0.034 seconds +2025-08-07T13:56:59Z INFO 49655 [CoreForkPass]: curr_vmrss: 2250mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:56:59Z INFO 49655 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 288380 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:59Z USER 49655 [CoreForkPass]: Running lower_dve +2025-08-07T13:56:59Z INFO 49655 [CoreForkPass]: Inputs to lower_dve: modules=1 functions=1 allocs=28328 blocks=1 instructions=288380 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:56:59Z INFO 49655 [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen2/dve_info.json +2025-08-07T13:57:00Z USER 49655 [CoreForkPass]: lower_dve finished after 0.337 seconds +2025-08-07T13:57:00Z INFO 49655 [CoreForkPass]: curr_vmrss: 2294mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:57:00Z INFO 49655 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 288380 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:57:00Z USER 49655 [CoreForkPass]: Running lower_ap +2025-08-07T13:57:00Z INFO 49655 [CoreForkPass]: Inputs to lower_ap: modules=1 functions=1 allocs=28328 blocks=1 instructions=288380 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:57:00Z USER 49655 [CoreForkPass]: lower_ap finished after 0.055 seconds +2025-08-07T13:57:00Z INFO 49655 [CoreForkPass]: curr_vmrss: 2251mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:57:00Z INFO 49655 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 288380 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:57:00Z USER 49655 [CoreForkPass]: Running coloring_allocator_reg +2025-08-07T13:57:00Z INFO 49655 [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=1 allocs=28328 blocks=1 instructions=288380 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:57:00Z INFO 49655 [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:57:00Z INFO 49655 [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:57:00Z INFO 49655 [REG_Allocator]: allocating REG +2025-08-07T13:57:00Z INFO 49655 [REG_Allocator]: main loop iteration 1 +2025-08-07T13:57:00Z INFO 49655 [REG_Allocator]: renumber registers +2025-08-07T13:57:00Z INFO 49655 [REG_Allocator]: size = 5 +2025-08-07T13:57:00Z INFO 49655 []: find first defs for local reg +2025-08-07T13:57:00Z INFO 49655 []: find first defs for global reg +2025-08-07T13:57:00Z INFO 49655 [REG_Allocator]: live range analysis +2025-08-07T13:57:00Z INFO 49655 [REG_Allocator]: find costs +2025-08-07T13:57:00Z INFO 49655 [REG_Allocator]: simplify interference graph +2025-08-07T13:57:00Z INFO 49655 [REG_Allocator]: initialize low and high +2025-08-07T13:57:00Z INFO 49655 [REG_Allocator]: lo = 5 +2025-08-07T13:57:00Z INFO 49655 [REG_Allocator]: hi = 0 +2025-08-07T13:57:00Z INFO 49655 [REG_Allocator]: inf = 0 +2025-08-07T13:57:00Z INFO 49655 [REG_Allocator]: total = 5 +2025-08-07T13:57:00Z INFO 49655 [REG_Allocator]: simplify +2025-08-07T13:57:00Z INFO 49655 [REG_Allocator]: new candidates = 0 +2025-08-07T13:57:00Z INFO 49655 [REG_Allocator]: select ranges +2025-08-07T13:57:00Z INFO 49655 [REG_Allocator]: no more spills +2025-08-07T13:57:00Z INFO 49655 [REG_Allocator]: REG score = 0 (lower is better) +2025-08-07T13:57:00Z INFO 49655 [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-08-07T13:57:00Z INFO 49655 [REG_Allocator]: 0% REG utilization after allocation +2025-08-07T13:57:00Z USER 49655 [CoreForkPass]: coloring_allocator_reg finished after 0.419 seconds +2025-08-07T13:57:00Z INFO 49655 [CoreForkPass]: curr_vmrss: 2296mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:57:00Z INFO 49655 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 288380 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:57:00Z USER 49655 [BackendPassManager]: nc_parallel_pass finished after 6.593 seconds +2025-08-07T13:57:00Z INFO 49655 [BackendPassManager]: curr_vmrss: 2251mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:57:00Z INFO 49655 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 288380 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:57:00Z USER 49655 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:57:00Z INFO 49655 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=28328 blocks=1 instructions=288380 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:57:00Z USER 49655 [ModuleForkPass]: Running birverifier +2025-08-07T13:57:00Z INFO 49655 [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=28328 blocks=1 instructions=288380 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:57:01Z USER 49655 [ModuleForkPass]: birverifier finished after 0.267 seconds +2025-08-07T13:57:01Z INFO 49655 [ModuleForkPass]: curr_vmrss: 1994mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:57:01Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 288380 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:57:01Z USER 49655 [BackendPassManager]: mod_parallel_pass finished after 0.273 seconds +2025-08-07T13:57:01Z INFO 49655 [BackendPassManager]: curr_vmrss: 1994mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:57:01Z INFO 49655 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 288380 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:57:01Z USER 49655 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:57:01Z INFO 49655 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=1 allocs=28328 blocks=1 instructions=288380 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:57:01Z USER 49655 [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:57:01Z INFO 49655 [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=28328 blocks=1 instructions=288380 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:57:01Z USER 49655 [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-08-07T13:57:01Z INFO 49655 [SubgraphForkPass]: curr_vmrss: 1994mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:57:01Z INFO 49655 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 288380 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:57:01Z USER 49655 [BackendPassManager]: subgraph_parallel_pass finished after 0.004 seconds +2025-08-07T13:57:01Z INFO 49655 [BackendPassManager]: curr_vmrss: 1994mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:57:01Z INFO 49655 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 288380 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:57:01Z USER 49655 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:57:01Z INFO 49655 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=28328 blocks=1 instructions=288380 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:57:01Z USER 49655 [ModuleForkPass]: Running codegen +2025-08-07T13:57:01Z INFO 49655 [ModuleForkPass]: Inputs to codegen: modules=1 functions=1 allocs=28328 blocks=1 instructions=288380 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:57:01Z INFO 49655 [Codegen]: Total compiler allocated DRAM tensors: 0.00290298 GB +2025-08-07T13:57:01Z INFO 49655 [Codegen]: Total un-allocated DRAM tensors by kind: +2025-08-07T13:57:01Z INFO 49655 [Codegen]: +┌────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼─────────────┤ +│ ExternalInput │ 7.69882 │ +│ ExternalOutput │ 3.72529e-09 │ +│ Const │ 0.0022452 │ +└────────────────┴─────────────┘ + +2025-08-07T13:57:01Z INFO 49655 [Codegen]: Total runtime managed DRAM tensors: 7.70106 GB +2025-08-07T13:57:02Z INFO 49655 [Codegen]: Instruction Stats: +2025-08-07T13:57:02Z INFO 49655 [Codegen]: +┌─────────────────────┬────────┐ +│ Opcode │ Count │ +├─────────────────────┼────────┤ +│ MATMUL │ 253176 │ +│ LDWEIGHTS │ 253068 │ +│ ACTIVATE │ 12603 │ +│ EVENT_SEMAPHORE │ 8775 │ +│ UNKNOWN(0xd4) │ 8300 │ +│ TENSOR_TENSOR │ 1341 │ +│ PSEUDO_DMA_TRIGGER │ 914 │ +│ LOAD_MASK_SELECT │ 546 │ +│ STREAM_SHUFFLE │ 510 │ +│ MATCH_VALUE_LOAD │ 441 │ +│ MEMSET │ 370 │ +│ TENSOR_SCALAR_ADDR │ 345 │ +│ TENSOR_SCALAR │ 332 │ +│ ACT_TABLE_LOAD │ 280 │ +│ CAST │ 239 │ +│ FIND_INDEX8 │ 224 │ +│ MAX8 │ 224 │ +│ MATCH_REPLACE8 │ 217 │ +│ UNKNOWN(0xda) │ 148 │ +│ GATHER │ 99 │ +│ POOL_BUFFER_LOAD │ 99 │ +│ TENSOR_REDUCE │ 79 │ +│ UNKNOWN(0xd9) │ 75 │ +│ RECIPROCAL │ 75 │ +│ IOTA │ 73 │ +│ COPY │ 73 │ +│ UNKNOWN(0xe8) │ 38 │ +│ UNKNOWN(0x8d) │ 36 │ +│ STREAM_TRANSPOSE │ 36 │ +│ PSEUDO_BRANCH_LABEL │ 5 │ +│ ALU_OP │ 2 │ +│ UNKNOWN(0xe5) │ 2 │ +│ MOVE │ 1 │ +│ NOP │ 1 │ +│ PSEUDO_TENSOR_LOAD │ 1 │ +│ RNG │ 1 │ +│ TENSOR_SCALAR │ 1 │ +└─────────────────────┴────────┘ + +2025-08-07T13:57:02Z INFO 49655 [Codegen]: +┌────────────┬────────┐ +│ Engine │ Count │ +├────────────┼────────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 13653 │ +│ Scalar │ 14460 │ +│ Tensor │ 509403 │ +│ SyncDMA │ 0 │ +│ Vector │ 5043 │ +│ Sync │ 196 │ +│ All │ 0 │ +└────────────┴────────┘ + +2025-08-07T13:57:02Z INFO 49655 [Codegen]: Total instructions: 542755 (0.0323507 GB) +2025-08-07T13:57:02Z INFO 49655 [Codegen]: Total DynamicDMA instruction count: 8300 +2025-08-07T13:57:02Z USER 49655 [Codegen]: isa_gen finished after 1.196 seconds +2025-08-07T13:57:02Z INFO 49655 [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────┼────────────────┤ +│ qActSpillReload0 │ 5932 │ +│ qDVESpillReload0 │ 264 │ +│ qPoolIO0 │ 2 │ +│ qPoolSpillReload0 │ 9100 │ +│ qSPIO0 │ 88 │ +│ qSPSpillReload0 │ 14558 │ +└───────────────────┴────────────────┘ + +Total descriptors: 29944 (0.0004462 GB) +2025-08-07T13:57:02Z INFO 49655 [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qSPIO0 │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qPoolDynamic │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +│ qPoolIO0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 112 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-08-07T13:57:02Z INFO 49655 [Codegen]: Tensors with largest descriptor count: +┌────────────────────────────────────────────────────────┬───────────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├────────────────────────────────────────────────────────┼───────────────┼──────────┼──────────────────┤ +│ Coalesced_memloc_cosine.140.56086--cosine.140.56082_81 │ Internal │ float32 │ 3 │ +│ Coalesced_memloc_cosine.140.56266--cosine.140.56262_27 │ Internal │ float32 │ 3 │ +│ Coalesced_memloc_cosine.140.56236--cosine.140.56232_36 │ Internal │ float32 │ 3 │ +│ Coalesced_memloc_cosine.140.56116--cosine.140.56112_72 │ Internal │ float32 │ 3 │ +│ Coalesced_memloc_cosine.140.56196--cosine.140.56192_48 │ Internal │ float32 │ 3 │ +│ Coalesced_memloc_cosine.140.56126--cosine.140.56122_69 │ Internal │ float32 │ 3 │ +│ Coalesced_memloc_cosine.140.56216--cosine.140.56212_42 │ Internal │ float32 │ 3 │ +│ Coalesced_memloc_cosine.140.56136--cosine.140.56132_66 │ Internal │ float32 │ 3 │ +│ input2 │ ExternalInput │ int32 │ 36 │ +│ convert.840 │ Internal │ float32 │ 599 │ +└────────────────────────────────────────────────────────┴───────────────┴──────────┴──────────────────┘ + +2025-08-07T13:57:02Z USER 49655 [Codegen]: dma_desc_gen finished after 0.015 seconds +2025-08-07T13:57:02Z INFO 49655 [Codegen]: Estimated peak DRAM usage: 7.73676 GB +2025-08-07T13:57:02Z INFO 49655 [Codegen]: Generating debug info +2025-08-07T13:57:03Z WARNING 49655 [Codegen]: Found 163 instructions with more than 100 dependencies. For each such instruction, skipping writing more than 100 dependencies into the built-in NEFF debug info to prevent excessive compile time and NEFF size. For those instructions, the Neuron profiler will not display the skipped dependencies. +2025-08-07T13:57:03Z USER 49655 [Codegen]: debug_info_gen finished after 0.701 seconds +2025-08-07T13:57:03Z USER 49655 [ModuleForkPass]: codegen finished after 1.966 seconds +2025-08-07T13:57:03Z INFO 49655 [ModuleForkPass]: curr_vmrss: 2228mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:57:03Z INFO 49655 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 288380 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:57:03Z USER 49655 [BackendPassManager]: mod_parallel_pass finished after 1.994 seconds +2025-08-07T13:57:03Z INFO 49655 [BackendPassManager]: curr_vmrss: 2032mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:57:03Z INFO 49655 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 288380 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:57:03Z USER 49655 [BackendPassManager]: Running neff_packager +2025-08-07T13:57:03Z INFO 49655 [BackendPassManager]: Inputs to neff_packager: modules=1 functions=1 allocs=28328 blocks=1 instructions=288380 Max writers: 1537 Max Readers: 20539 +2025-08-07T13:57:03Z WARNING 49655 [NeffFileWriter]: writeKelp missing file /local/p4clients/pkgbuild-const/workspace/build/KaenaCompiler/KaenaCompiler-2.x.169490.0/AL2_x86_64/DEV.STD.PTHREAD/build/private/_skbuild/linux-x86_64-3.10/cmake-build/neuronxcc/walrus/neff_packager/MetricMetadata.json +2025-08-07T13:57:03Z INFO 49655 [NeffFileWriter]: Neff will be written to: /home/ubuntu/qwen3/token_generation_model/_tp0_bk2/model.MODULE_0ae1021f5dbf9cbac54d+2aa9c8c9.neff +2025-08-07T13:57:03Z INFO 49655 [NeffFileWriter]: IR signature: cadf729c27cf5c24ae6041ac577fdedf for neff artifacts +2025-08-07T13:57:03Z USER 49655 [BackendPassManager]: neff_packager finished after 0.322 seconds +2025-08-07T13:57:03Z INFO 49655 [BackendPassManager]: curr_vmrss: 2032mb, ru_maxrss: 2402mb (delta=0mb) +2025-08-07T13:57:03Z INFO 49655 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28328 memory location(s), 1 block(s), and 288380 instruction(s). Max writers: 1537 Max Readers: 20539 +2025-08-07T13:57:03Z INFO 49655 [BackendDriver]: HBM scratchpad usage summary (post-allocation): +┌──────┬───────────┬────────────────────────────────────────────────────────────┬─────────────┐ +│ Core │ Subgraph │ Description │ Value │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc00 │ module │ Peak scratchpad usage: local │ 0.002903 GB │ +│ nc00 │ module │ Total size of allocated tensors: local │ 0.003231 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local │ 0.002903 GB │ +│ nc00 │ Post-link │ Peak scratchpad usage after intermediate tensor allocation │ 0.000000 GB │ +│ nc00 │ Post-link │ Total size of allocated intermediate tensors │ 0.000000 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ Max │ Max │ Peak scratchpad usage │ 0.002903 GB │ +│ Max │ Max │ Peak scratchpad usage (page-aligned) │ 0.500000 GB │ +└──────┴───────────┴────────────────────────────────────────────────────────────┴─────────────┘ + +2025-08-07T13:57:03Z INFO 49655 [BackendDriver]: Backend completed successfully, tearing down. +2025-08-07T13:57:04Z INFO 48510 [job.WalrusDriver.0]: Job #0 finished +2025-08-07T13:57:04Z INFO 48510 [pipeline.Pipeline.0]: Finished job job.WalrusDriver.0 +2025-08-07T13:57:04Z INFO 48510 [pipeline.Pipeline.0]: Starting job job.BIRLinker.0 +2025-08-07T13:57:04Z INFO 48510 [job.BIRLinker.0]: Replay this job by calling: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/bin/neuronx-cc compile --framework XLA --state '{"model": ["/home/ubuntu/qwen3/token_generation_model/_tp0_bk2/model.MODULE_0ae1021f5dbf9cbac54d+2aa9c8c9.hlo_module.pb"], "tensormap": "tensor_map.json", "bir": "bir.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "state_dir": "/home/ubuntu/qwen3/token_generation_model/_tp0_bk2/neuronxcc-gh4n9bnf/sg00", "state_id": "sg00"}' --pipeline BIRLinker +2025-08-07T13:57:04Z INFO 48510 [job.BIRLinker.0]: BIRLinker cwd: /home/ubuntu/qwen3/token_generation_model/_tp0_bk2/neuronxcc-gh4n9bnf +2025-08-07T13:57:04Z INFO 48510 [job.BIRLinker.0]: Linking not needed. Netlist doesnt exist +2025-08-07T13:57:04Z INFO 48510 [pipeline.Pipeline.0]: Finished job job.BIRLinker.0 +2025-08-07T13:57:04Z INFO 48510 [pipeline.Pipeline.0]: Starting job job.Kelper.0 +2025-08-07T13:57:04Z INFO 48510 [job.Kelper.0]: Skipping neff generation which was already performed by neff_packager +2025-08-07T13:57:04Z INFO 48510 [pipeline.Pipeline.0]: Finished job job.Kelper.0 +2025-08-07T13:57:04Z INFO 48510 [pipeline.Pipeline.0]: Starting job job.NeffWrapper.0 +2025-08-07T13:57:04Z INFO 48510 [job.NeffWrapper.0]: Job NeffWrapper len(in_states) 1 +2025-08-07T13:57:04Z INFO 48510 [job.NeffWrapper.0]: Processing input #0 +2025-08-07T13:57:04Z INFO 48510 [job.NeffWrapper.0]: Start NeffWrapper +2025-08-07T13:57:04Z INFO 48510 [job.NeffWrapper.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo-neff-wrapper --hlo /home/ubuntu/qwen3/token_generation_model/_tp0_bk2/model.MODULE_0ae1021f5dbf9cbac54d+2aa9c8c9.hlo_module.pb --neff /home/ubuntu/qwen3/token_generation_model/_tp0_bk2/model.MODULE_0ae1021f5dbf9cbac54d+2aa9c8c9.neff --io_transposes /home/ubuntu/qwen3/token_generation_model/_tp0_bk2/neuronxcc-gh4n9bnf/io_transposes.json --output /home/ubuntu/qwen3/token_generation_model/_tp0_bk2/wrapped_neff.hlo --netlist /home/ubuntu/qwen3/token_generation_model/_tp0_bk2/neuronxcc-gh4n9bnf/hlo_netlist.json +2025-08-07T13:57:04Z INFO 48510 [job.NeffWrapper.0]: Could not open file: /home/ubuntu/qwen3/token_generation_model/_tp0_bk2/neuronxcc-gh4n9bnf/hlo_netlist.json +There are no io transposes nor zero-sized parameters. Output will not be produced. +Hlo neff wrapper finished successfully. Have a wonderful day :D + +2025-08-07T13:57:04Z INFO 48510 [job.NeffWrapper.0]: Job #0 finished +2025-08-07T13:57:04Z INFO 48510 [pipeline.Pipeline.0]: Finished job job.NeffWrapper.0 +2025-08-07T13:57:04Z INFO 48510 [pipeline.Pipeline.0]: Finished pipeline Pipeline +2025-08-07T13:57:04Z INFO 48510 [pipeline.Pipeline.0]: Job #0 finished +2025-08-07T13:57:04Z INFO 47984 [root]: Subcommand returned with exitcode=0 diff --git a/token_generation_model/_tp0_bk2/metaneff.pb b/token_generation_model/_tp0_bk2/metaneff.pb new file mode 100644 index 0000000000000000000000000000000000000000..60ecdb879eca595a24215ffbf504c38ca92d0042 --- /dev/null +++ b/token_generation_model/_tp0_bk2/metaneff.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4501a770a6eb39b950a2af02dd415554181660d67a75873b8061e3e3fb2342ca +size 984551 diff --git a/token_generation_model/_tp0_bk2/model.MODULE_0ae1021f5dbf9cbac54d+2aa9c8c9.hlo_module.pb b/token_generation_model/_tp0_bk2/model.MODULE_0ae1021f5dbf9cbac54d+2aa9c8c9.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..f781b1f25b5b321a0235894bddb1870fbd1fbf3e --- /dev/null +++ b/token_generation_model/_tp0_bk2/model.MODULE_0ae1021f5dbf9cbac54d+2aa9c8c9.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c544d0e20f56d5383d37218086cb993108067db2d992950c04a8fb8d9b4a59b +size 1063359 diff --git a/token_generation_model/_tp0_bk2/model.MODULE_0ae1021f5dbf9cbac54d+2aa9c8c9.neff b/token_generation_model/_tp0_bk2/model.MODULE_0ae1021f5dbf9cbac54d+2aa9c8c9.neff new file mode 100644 index 0000000000000000000000000000000000000000..65f817e2c21386e47706553c4e4d419737c40300 --- /dev/null +++ b/token_generation_model/_tp0_bk2/model.MODULE_0ae1021f5dbf9cbac54d+2aa9c8c9.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e039781a4148e1dd0618aaeaa59cdbeeede45475e212500ec37542014ecafb73 +size 6083584 diff --git a/token_generation_model/_tp0_bk2/neuron_config.json b/token_generation_model/_tp0_bk2/neuron_config.json new file mode 100644 index 0000000000000000000000000000000000000000..59c356e49b3e7029870fa6d29302e2ef188933f6 --- /dev/null +++ b/token_generation_model/_tp0_bk2/neuron_config.json @@ -0,0 +1,220 @@ +{ + "_attn_implementation_autoset": false, + "_name_or_path": "Qwen/Qwen3-8B", + "add_cross_attention": false, + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "attribute_map": {}, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 151643, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 151645, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "fused_spec_config": null, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 12288, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 40960, + "max_window_layers": 36, + "metadata": null, + "min_length": 0, + "model_type": "qwen3", + "neuron_config": { + "activation_quantization_type": null, + "allow_input_truncation": false, + "apply_seq_ids_mask": false, + "async_mode": false, + "attention_dp_degree": 1, + "attention_dtype": null, + "attn_block_cte_nki_kernel_enabled": false, + "attn_block_tkg_nki_kernel_cache_update": false, + "attn_block_tkg_nki_kernel_enabled": false, + "attn_cls": { + "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3", + "__name__": "NeuronQwen3Attention" + }, + "attn_kernel_enabled": null, + "attn_tkg_builtin_kernel_enabled": false, + "attn_tkg_nki_kernel_enabled": false, + "batch_size": 1, + "bucket_n_active_tokens": false, + "buckets": [ + 512 + ], + "cast_type": "config", + "cc_pipeline_tiling_factor": 1, + "chunked_prefill_config": null, + "context_encoding_buckets": null, + "cp_degree": 1, + "ctx_batch_size": 1, + "disable_kv_cache_tiling": false, + "draft_model_modules_to_not_convert": null, + "enable_bucketing": true, + "enable_eagle_draft_input_norm": false, + "enable_eagle_speculation": false, + "enable_fused_speculation": false, + "enable_long_context_mode": false, + "enable_output_completion_notifications": false, + "enable_spill_reload_dge": false, + "enable_token_tree": false, + "ep_degree": 1, + "expert_mlp_nki_kernel_enabled": null, + "flash_decoding_enabled": false, + "fused_qkv": false, + "fused_rmsnorm_skip_gamma": false, + "is_block_kv_layout": null, + "is_chunked_prefill": false, + "is_continuous_batching": true, + "is_eagle_draft": false, + "is_medusa": false, + "is_prefill_stage": false, + "is_prefix_caching": false, + "k_cache_transposed": false, + "kv_cache_batch_size": 1, + "kv_cache_padding_size": 0, + "kv_cache_quant": false, + "kv_cache_tiling": false, + "layer_boundary_markers": false, + "lm_head_pad": false, + "lm_head_pad_alignment_size": 1, + "local_ranks_size": 2, + "logical_nc_config": 1, + "lora_config": null, + "max_batch_size": 1, + "max_context_length": 1024, + "max_length": 1024, + "max_new_tokens": null, + "medusa_speculation_length": 0, + "medusa_tree": null, + "mlp_kernel_enabled": false, + "mlp_kernel_fuse_residual_add": false, + "modules_to_not_convert": null, + "moe_fused_nki_kernel_enabled": null, + "n_active_tokens": 1, + "n_positions": 1024, + "num_medusa_heads": 0, + "on_cpu": false, + "on_device_sampling_config": { + "deterministic": false, + "do_sample": false, + "dynamic": true, + "global_topk": 256, + "on_device_sampling_config": true, + "temperature": 1.0, + "top_k": 1, + "top_k_kernel_enabled": false, + "top_p": 1.0 + }, + "output_logits": false, + "overrides_torch_dtype": true, + "pa_block_size": 1024, + "pa_num_blocks": 1, + "padding_side": "right", + "pp_degree": 1, + "prefix_buckets": null, + "qk_layernorm": false, + "qkv_kernel_enabled": false, + "qkv_kernel_fuse_residual_add": false, + "qkv_kernel_nbsd_layout": false, + "quantization_dtype": "int8", + "quantization_type": "per_tensor_symmetric", + "quantize_clamp_bound": Infinity, + "quantized": false, + "quantized_checkpoints_path": null, + "quantized_mlp_kernel_enabled": false, + "rmsnorm_quantize_kernel_enabled": false, + "router_topk_nki_kernel_enabled": null, + "rpl_reduce_dtype": null, + "save_sharded_checkpoint": true, + "scratchpad_page_size": null, + "seq_len": 1024, + "seq_len_threshold_for_cc_tiling": 16384, + "sequence_parallel_enabled": false, + "shared_mlp_nki_kernel_enabled": null, + "skip_sharding": false, + "skip_warmup": false, + "spec_batch_size": 1, + "speculation_length": 0, + "start_rank_id": 0, + "target": null, + "tile_cc": false, + "tkg_batch_size": 1, + "token_generation_buckets": [ + 512 + ], + "token_tree_config": null, + "torch_dtype": "bfloat16", + "tp_degree": 2, + "vocab_parallel": false, + "weight_gather_seq_len_threshold": 32768, + "weights_to_skip_layout_optimization": [], + "world_size": 2 + }, + "no_repeat_ngram_size": 0, + "num_attention_heads": 32, + "num_beam_groups": 1, + "num_beams": 1, + "num_cores_per_group": 1, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sep_token_id": null, + "sliding_window": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torchscript": false, + "transformers_version": "4.51.0", + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/token_generation_model/_tp0_bk3/command.txt b/token_generation_model/_tp0_bk3/command.txt new file mode 100644 index 0000000000000000000000000000000000000000..a0aeafee1ba8892361f3b59f83e0c97a4444df03 --- /dev/null +++ b/token_generation_model/_tp0_bk3/command.txt @@ -0,0 +1 @@ +neuronx-cc compile --framework=XLA model.MODULE_d3ed4857bd8baeff8023+b05cff0a.hlo_module.pb --output model.MODULE_d3ed4857bd8baeff8023+b05cff0a.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ' --lnc=1 -O2 --internal-hlo2tensorizer-options=--verify-hlo=true --logfile=log-neuron-cc.txt --verbose=35 \ No newline at end of file diff --git a/token_generation_model/_tp0_bk3/compile_flags.MODULE_d3ed4857bd8baeff8023+b05cff0a.json b/token_generation_model/_tp0_bk3/compile_flags.MODULE_d3ed4857bd8baeff8023+b05cff0a.json new file mode 100644 index 0000000000000000000000000000000000000000..5ace8116d4aebe55a21f4286ba4f68cfab9f34b3 --- /dev/null +++ b/token_generation_model/_tp0_bk3/compile_flags.MODULE_d3ed4857bd8baeff8023+b05cff0a.json @@ -0,0 +1 @@ +["--target=trn1", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ", "--lnc=1", "-O2", "--internal-hlo2tensorizer-options=--verify-hlo=true", "--logfile=/home/ubuntu/qwen3/token_generation_model/_tp0_bk3/log-neuron-cc.txt"] \ No newline at end of file diff --git a/token_generation_model/_tp0_bk3/global_metric_store.json b/token_generation_model/_tp0_bk3/global_metric_store.json new file mode 100644 index 0000000000000000000000000000000000000000..b861ec33cf2d9b236cd6132d860d3c58a3a08daa --- /dev/null +++ b/token_generation_model/_tp0_bk3/global_metric_store.json @@ -0,0 +1,540 @@ +{ + "Average": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 99.84185028076172, + "StaticProfiler::AveragePartitionUtilization": 99.48448944091797, + "StaticProfiler::AveragePeUtilization": 99.56957244873047, + "StaticProfiler::LocalizationEfficiency": 109.03128051757813, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 109.1170883178711, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0 + } + }, + "Count": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 1, + "StaticProfiler::AveragePartitionUtilization": 1, + "StaticProfiler::AveragePeUtilization": 1, + "StaticProfiler::LocalizationEfficiency": 1, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1, + "TilingProfiler::AveragePeUtilizationAfterTiling": 1 + } + }, + "Sum": { + "compiletime": { + "AGOrderingAnalysisPass": 1.969146728515625, + "AffinePredicateResolution": 0.05552172660827637, + "AliasDependencyElimination": 0.002666950225830078, + "AliasDependencyInduction": 0.4502143859863281, + "AliasDependencyReset": 0.4788334369659424, + "BFComputeCutting": 0.058393001556396484, + "BirCodeGenLoop": 2.2788913249969482, + "CCOpFusion": 0.47452735900878906, + "CanonicalizeConv": 0.00029399999766610563, + "CanonicalizeDAGForPGTiling": 0.2187955379486084, + "CanonicalizeForTensorizer": 0.000450999999884516, + "CanonicalizeIR": 0.07366371154785156, + "Canonicalizer": 0.007921000011265278, + "CoalesceCCOp": 0.19643092155456543, + "CommuteConcat": 0.04170727729797363, + "DMALocalityOpt": 0.03731536865234375, + "DMAProfiler": 0.09144282341003418, + "DMATilingProfiler": 0.07170844078063965, + "DataLocalityOpt": 1.9889020919799805, + "DataStreaming": 0.16355490684509277, + "DeConcat": 0.021522998809814453, + "DeadCodeElimination": 0.0381464958190918, + "DeadStoreElimination": 0.41367006301879883, + "DelinearIndices": 0.33849334716796875, + "Delinearization": 0.20084929466247559, + "DoNothing": 0.0001838207244873047, + "DramToDramTranspose": 1.0951581001281738, + "DumpGraphAndMetadata": 0.25267457962036133, + "EliminateDivs": 0.18700790405273438, + "ExpandBatchNorm": 0.0730886459350586, + "ExpandISAMacro": 0.09423613548278809, + "FactorizeBlkDims": 0.4047412872314453, + "FactorizeThreadAxesInFreeDims": 0.04455876350402832, + "FlattenMacroLoop": 0.0804436206817627, + "GenericAccessSimplifier": 0.03617668151855469, + "HoistCompute": 7.899999764049426e-05, + "IdentifyCrossPassTensors": 0.00021100000594742596, + "InferInitValue": 1.0891494750976563, + "InferIntrinsicOnCC": 0.365936279296875, + "InferNeuronTensor": 1.7837121486663818, + "InferNonlocalTensors": 4.328860282897949, + "InferPSumTensor": 1.4249539375305176, + "InlineNativeKernels": 0.05753588676452637, + "InsertIOTransposes": 0.991222620010376, + "InsertLocalTransposes": 0.923569917678833, + "InsertOffloadedTransposes": 0.09548735618591309, + "LICM": 0.1112966537475586, + "LateLegalizeInst": 0.3268578052520752, + "LateLegalizePostSplit": 0.09773421287536621, + "LateLowerReshapeOp": 0.045319557189941406, + "LateLowerTensorOp": 0.3748793601989746, + "LateNeuronInstComb": 0.4980580806732178, + "LayoutPreprocessing": 1.0078740119934082, + "LayoutPreprocessingAndAnalysis": 1.342142105102539, + "LayoutRequirementAnalysis": 0.3213934898376465, + "LegalizeCCOpLayout": 0.08592772483825684, + "LegalizeOpLevelAlias": 0.040465354919433594, + "LegalizePartitionReduce": 0.0850517749786377, + "LegalizeSundaAccess": 1.400620698928833, + "LegalizeSundaMacro": 0.43457961082458496, + "LegalizeType": 0.19485044479370117, + "LocalLayoutOpt": 0.3885209560394287, + "LoopFusion": 0.3380746841430664, + "LoopSplitting": 0.013431549072265625, + "LowerBroadcast": 0.05305767059326172, + "LowerCCOpBlockAxis": 0.23342013359069824, + "LowerComplexBroadcast": 0.15745258331298828, + "LowerIntrinsics": 1.2234890460968018, + "LowerTensorOp": 0.5340566635131836, + "LowerTranspose": 0.3922152519226074, + "MacroGeneration": 2.18086314201355, + "MaskPropagation": 0.1404891014099121, + "MemcastMotion": 0.00022899999748915434, + "MemcpyElimination": 4.92922306060791, + "MutateDataType": 0.0495760440826416, + "NeuronAliasDependencyInduction": 0.025828838348388672, + "NeuronAliasDependencyReset": 0.036108970642089844, + "NeuronInstComb": 0.21550488471984863, + "NeuronLICM": 0.2960355281829834, + "NeuronLoopFusion": 0.5112388134002686, + "NeuronLoopInterchange": 0.049500465393066406, + "NeuronSimplifier": 0.33487582206726074, + "NeuronSimplifyPredicates": 0.18529582023620605, + "NeuronValueNumbering": 0.1114494800567627, + "OptimizeAliasedCopyChain": 0.01750659942626953, + "OptimizeNKIKernels": 0.4911010265350342, + "PAGLayoutOpt": 29.41068458557129, + "PComputeCutting": 0.49399471282958984, + "PGLayoutTilingPipeline": 43.084712982177734, + "PGTiling": 4.971738815307617, + "PadElimination": 0.009459257125854492, + "ParAxesAnnotation": 28.47618865966797, + "PartialLoopFusion": 0.4028160572052002, + "PartialSimdFusion": 0.3455219268798828, + "PenguinizeFunctions": 0.0002229999954579398, + "PerfectLoopNest": 0.06593203544616699, + "PruneFunctions": 0.0006189999985508621, + "RecognizeOpIdiom": 0.21458721160888672, + "Recompute": 0.007538557052612305, + "RelaxPredicates": 0.15262603759765625, + "Rematerialization": 0.17077207565307617, + "RemoveOptimizationBarriers": 0.0003169999981764704, + "ReshapeWeights": 0.02144646644592285, + "ResolveAccessConflict": 0.3060123920440674, + "ResolveComplicatePredicates": 0.05498790740966797, + "RewriteReplicationMatmul": 0.04878973960876465, + "RewriteWeights": 0.06134176254272461, + "SFKVectorizer": 4.838059902191162, + "ScatterMotion": 0.004941000137478113, + "SimpleAllReduceTiling": 0.07000184059143066, + "Simplifier": 0.12532973289489746, + "SimplifyMacroPredicates": 0.18832993507385254, + "SimplifyNeuronTensor": 1.3622710704803467, + "SimplifySlice": 0.037230491638183594, + "SimplifyTensor": 0.20799946784973145, + "SpillPSum": 0.4076821804046631, + "SplitAPUnionSets": 0.3596620559692383, + "SplitAccGrp": 0.0434412956237793, + "StaticProfiler": 0.14051222801208496, + "StaticTransposeLocalTensor": 0.22574949264526367, + "SundaISel": 1.6489593982696533, + "TCTransform": 0.04065871238708496, + "TensorInitialization": 0.2030961513519287, + "TensorOpSimplifier": 0.46704745292663574, + "TensorOpTransform": 1.5862066745758057, + "TensorizerLegalizationPass": 0.0001829999964684248, + "TileCCOps": 0.22340822219848633, + "TilingProfiler": 0.5842618942260742, + "TransformConvOp": 0.07411432266235352, + "TritiumFusion": 1.2081208229064941, + "ValueNumbering": 0.10727286338806152, + "VectorizeDMA": 0.03964352607727051, + "VectorizeMatMult": 0.023961544036865234, + "VerifySupportedOps": 0.00027600000612437725, + "WeightCoalescing": 0.05985283851623535, + "ZeroSizeTensorElimination": 0.0008418560028076172, + "algsimp": 0.0025730000343173742, + "batchnorm_expander": 0.000977999996393919, + "boundary-marker-removal": 0.000506000011228025, + "call-inliner": 0.000493000028654933, + "canonicalize-boundary-marker": 0.0005740000051446259, + "collective-stream-id-checker": 7.599999662488699e-05, + "comparison-expander": 0.0005489999894052744, + "computation-deduplicator": 0.0005189999938011169, + "conditional-to-select": 0.00014400000509340316, + "config-lowering": 0.000371000001905486, + "constant_folding": 0.0003169999981764704, + "cse": 0.0007699999841861427, + "dce": 7.000000186963007e-05, + "dynamic-slice-transpose": 0.00025599999935366213, + "eliminate-redundant-compare": 0.0002209999947808683, + "emit-offloaded-dropout": 0.0004670000053010881, + "flatten-call-graph": 0.0004189999890513718, + "fuse-send-recv": 0.0021180000621825457, + "hilo::LegalizeAlias": 0.003845999948680401, + "hilo::NeuronInstCombine": 0.001500000013038516, + "hilo::NeuronOpFusion": 0.0006680000224150717, + "hilo::ReplaceTokenTypeWithU8Pass": 0.0006360000115819275, + "hilo::ScheduleFusion": 4.199999966658652e-05, + "hilo::SixtyFourHack": 0.0003600000054575503, + "hilo::VerifyAliasing": 8.199999865610152e-05, + "hlo-mac-count": 0.0010760000441223383, + "hlo-verifier": 0.007385000120848417, + "io-con-pipe-begin": 7.000000096013537e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.0014100000262260437, + "legalize-ccops": 2.5999999706982635e-05, + "legalize-compare": 0.0004780000017490238, + "lower-argminmax-custom-call": 0.0002749999985098839, + "map-inline": 0.0007609999738633633, + "metadata-naming": 0.0012600000482052565, + "mlir::detail::OpToOpPassAdaptor": 0.0002739999908953905, + "mlir::hlo::MhloToPyPenguin": 0.03262700140476227, + "mlir::mhlo::LowerComplexExtraPass": 0.0031580000650137663, + "mlir::mhlo::LowerComplexPass": 0.00228899996727705, + "native-to-custom-softmax": 0.0005099999834783375, + "native-to-custom-softmax-dx": 0.0005740000051446259, + "operand_upcaster": 0.0009139999747276306, + "post-par-pipe-begin": 9.999999974752427e-07, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.06880500167608261, + "pre-hlo-begin": 4.999999873689376e-06, + "pre-hlo-end": 9.999999974752427e-07, + "replace-minimum-constant": 0.00031400000443682075, + "reshape-mover": 0.00012700000661425292, + "simplify-concat": 0.002675000112503767, + "simplify-while-loops": 9.500000305706635e-05, + "transform-variadic-reduce": 0.0008009999874047935, + "tuple-simplifier": 0.00026699999580159783, + "unpack-nested-aws-ntwsr": 0.0005019999807700515, + "unroll-while-loop": 1.700000029813964e-05 + }, + "hilo": { + "HloMacCount": 3935117312.0, + "Traffic": 8267158016.0 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 267577, + "StaticProfiler::AifUb": 11.113597869873047, + "StaticProfiler::ArithmeticIntensityTensorizer": 12.117298126220703, + "StaticProfiler::AverageDmaLength": 6709.45703125, + "StaticProfiler::DDRTransferBytes": 7653265752, + "StaticProfiler::InternalTransferBytes": 667425812, + "StaticProfiler::LoadExpanded": 1033415, + "StaticProfiler::StoreExpanded": 3422, + "StaticProfiler::TotalDMAExpanded": 1036837, + "StaticProfiler::TotalDynamicInstancesCount": 279479, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 278490, + "StaticProfiler::TotalLNCComm": 0, + "StaticProfiler::TotalLNCCommTransfer": 0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0, + "TilingProfiler::DmaInstructionsAfterTiling": 0, + "TilingProfiler::GenericInstructionsAfterTiling": 79, + "TilingProfiler::MatMultInstructionsAfterTiling": 233424, + "TilingProfiler::NumPfTransposes": 327, + "TilingProfiler::NumPfTransposesForIo": 38, + "TilingProfiler::NumPfTransposesForLocal": 144, + "TilingProfiler::NumPfTransposesForNonlocal": 145, + "TilingProfiler::PfTransposeInstructions": 20702, + "TilingProfiler::PfTransposeInstructionsForIo": 20161, + "TilingProfiler::PfTransposeInstructionsForLocal": 396, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 145, + "TilingProfiler::ReduceInstructionsAfterTiling": 362, + "TilingProfiler::SimdInstructionsAfterTiling": 3755, + "TilingProfiler::TotalInstructionsAfterTiling": 0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0, + "TransformConvOp::conv2d_column_packing": 0, + "TransformConvOp::conv2d_column_packing_1": 0, + "TransformConvOp::conv2d_column_packing_io10": 0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0 + } + }, + "all": { + "compiletime": { + "CanonicalizeConv": 0.00029399999766610563, + "CanonicalizeForTensorizer": 0.000450999999884516, + "Canonicalizer": 0.007921000011265278, + "HoistCompute": 7.899999764049426e-05, + "IdentifyCrossPassTensors": 0.00021100000594742596, + "MemcastMotion": 0.00022899999748915434, + "PenguinizeFunctions": 0.0002229999954579398, + "PruneFunctions": 0.0006189999985508621, + "RemoveOptimizationBarriers": 0.0003169999981764704, + "ScatterMotion": 0.004941000137478113, + "TensorizerLegalizationPass": 0.0001829999964684248, + "VerifySupportedOps": 0.00027600000612437725, + "algsimp": 0.0025730000343173742, + "batchnorm_expander": 0.000977999996393919, + "boundary-marker-removal": 0.000506000011228025, + "call-inliner": 0.000493000028654933, + "canonicalize-boundary-marker": 0.0005740000051446259, + "collective-stream-id-checker": 7.599999662488699e-05, + "comparison-expander": 0.0005489999894052744, + "computation-deduplicator": 0.0005189999938011169, + "conditional-to-select": 0.00014400000509340316, + "config-lowering": 0.000371000001905486, + "constant_folding": 0.0003169999981764704, + "cse": 0.0007699999841861427, + "dce": 7.000000186963007e-05, + "dynamic-slice-transpose": 0.00025599999935366213, + "eliminate-redundant-compare": 0.0002209999947808683, + "emit-offloaded-dropout": 0.0004670000053010881, + "flatten-call-graph": 0.0004189999890513718, + "fuse-send-recv": 0.0021180000621825457, + "hilo::LegalizeAlias": 0.003845999948680401, + "hilo::NeuronInstCombine": 0.001500000013038516, + "hilo::NeuronOpFusion": 0.0006680000224150717, + "hilo::ReplaceTokenTypeWithU8Pass": 0.0006360000115819275, + "hilo::ScheduleFusion": 4.199999966658652e-05, + "hilo::SixtyFourHack": 0.0003600000054575503, + "hilo::VerifyAliasing": 8.199999865610152e-05, + "hlo-mac-count": 0.0010760000441223383, + "hlo-verifier": 0.007385000120848417, + "io-con-pipe-begin": 7.000000096013537e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.0014100000262260437, + "legalize-ccops": 2.5999999706982635e-05, + "legalize-compare": 0.0004780000017490238, + "lower-argminmax-custom-call": 0.0002749999985098839, + "map-inline": 0.0007609999738633633, + "metadata-naming": 0.0012600000482052565, + "mlir::detail::OpToOpPassAdaptor": 0.0002739999908953905, + "mlir::hlo::MhloToPyPenguin": 0.03262700140476227, + "mlir::mhlo::LowerComplexExtraPass": 0.0031580000650137663, + "mlir::mhlo::LowerComplexPass": 0.00228899996727705, + "native-to-custom-softmax": 0.0005099999834783375, + "native-to-custom-softmax-dx": 0.0005740000051446259, + "operand_upcaster": 0.0009139999747276306, + "post-par-pipe-begin": 9.999999974752427e-07, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.06880500167608261, + "pre-hlo-begin": 4.999999873689376e-06, + "pre-hlo-end": 9.999999974752427e-07, + "replace-minimum-constant": 0.00031400000443682075, + "reshape-mover": 0.00012700000661425292, + "simplify-concat": 0.002675000112503767, + "simplify-while-loops": 9.500000305706635e-05, + "transform-variadic-reduce": 0.0008009999874047935, + "tuple-simplifier": 0.00026699999580159783, + "unpack-nested-aws-ntwsr": 0.0005019999807700515, + "unroll-while-loop": 1.700000029813964e-05 + } + }, + "cumsum": { + "compiletime": { + "CoalesceCCOp": 0.00020194053649902344, + "DMALocalityOpt": 0.00016427040100097656, + "DMAProfiler": 0.0007579326629638672, + "DataStreaming": 0.00026416778564453125, + "DoNothing": 0.00011372566223144531, + "ExpandISAMacro": 0.0005254745483398438, + "FactorizeBlkDims": 0.00045561790466308594, + "InferPSumTensor": 0.0004715919494628906, + "LateLegalizeInst": 0.00040221214294433594, + "LateNeuronInstComb": 0.0004830360412597656, + "LegalizeSundaAccess": 0.0016069412231445313, + "LegalizeType": 0.00031256675720214844, + "LowerBroadcast": 0.000362396240234375, + "LowerIntrinsics": 0.00021576881408691406, + "LowerTranspose": 0.000415802001953125, + "NeuronInstComb": 0.0004990100860595703, + "NeuronLICM": 0.00041961669921875, + "NeuronSimplifyPredicates": 0.0028657913208007813, + "NeuronValueNumbering": 0.0005724430084228516, + "SFKVectorizer": 0.002659320831298828, + "SimpleAllReduceTiling": 0.00019240379333496094, + "SimplifyNeuronTensor": 0.0004341602325439453, + "SpillPSum": 0.000522613525390625, + "WeightCoalescing": 0.00020360946655273438 + } + }, + "sg00": { + "hilo": { + "ArithmeticIntensity": 0.9519879221916199, + "HloMacCount": 3935117312.0, + "Traffic": 8267158016.0 + } + }, + "sg0000": { + "compiletime": { + "AGOrderingAnalysisPass": 1.969146728515625, + "AffinePredicateResolution": 0.05552172660827637, + "AliasDependencyElimination": 0.002666950225830078, + "AliasDependencyInduction": 0.4502143859863281, + "AliasDependencyReset": 0.4788334369659424, + "BFComputeCutting": 0.058393001556396484, + "BirCodeGenLoop": 2.2788913249969482, + "CCOpFusion": 0.47452735900878906, + "CanonicalizeDAGForPGTiling": 0.2187955379486084, + "CanonicalizeIR": 0.07366371154785156, + "CoalesceCCOp": 0.1962289810180664, + "CommuteConcat": 0.04170727729797363, + "DMALocalityOpt": 0.03715109825134277, + "DMAProfiler": 0.09068489074707031, + "DMATilingProfiler": 0.07170844078063965, + "DataLocalityOpt": 1.9889020919799805, + "DataStreaming": 0.16329073905944824, + "DeConcat": 0.021522998809814453, + "DeadCodeElimination": 0.0381464958190918, + "DeadStoreElimination": 0.41367006301879883, + "DelinearIndices": 0.33849334716796875, + "Delinearization": 0.20084929466247559, + "DoNothing": 7.009506225585938e-05, + "DramToDramTranspose": 1.0951581001281738, + "DumpGraphAndMetadata": 0.25267457962036133, + "EliminateDivs": 0.18700790405273438, + "ExpandBatchNorm": 0.0730886459350586, + "ExpandISAMacro": 0.09371066093444824, + "FactorizeBlkDims": 0.4042856693267822, + "FactorizeThreadAxesInFreeDims": 0.04455876350402832, + "FlattenMacroLoop": 0.0804436206817627, + "GenericAccessSimplifier": 0.03617668151855469, + "InferInitValue": 1.0891494750976563, + "InferIntrinsicOnCC": 0.365936279296875, + "InferNeuronTensor": 1.7837121486663818, + "InferNonlocalTensors": 4.328860282897949, + "InferPSumTensor": 1.4244823455810547, + "InlineNativeKernels": 0.05753588676452637, + "InsertIOTransposes": 0.991222620010376, + "InsertLocalTransposes": 0.923569917678833, + "InsertOffloadedTransposes": 0.09548735618591309, + "LICM": 0.1112966537475586, + "LateLegalizeInst": 0.32645559310913086, + "LateLegalizePostSplit": 0.09773421287536621, + "LateLowerReshapeOp": 0.045319557189941406, + "LateLowerTensorOp": 0.3748793601989746, + "LateNeuronInstComb": 0.497575044631958, + "LayoutPreprocessing": 1.0078740119934082, + "LayoutPreprocessingAndAnalysis": 1.342142105102539, + "LayoutRequirementAnalysis": 0.3213934898376465, + "LegalizeCCOpLayout": 0.08592772483825684, + "LegalizeOpLevelAlias": 0.040465354919433594, + "LegalizePartitionReduce": 0.0850517749786377, + "LegalizeSundaAccess": 1.3990137577056885, + "LegalizeSundaMacro": 0.43457961082458496, + "LegalizeType": 0.19453787803649902, + "LocalLayoutOpt": 0.3885209560394287, + "LoopFusion": 0.3380746841430664, + "LoopSplitting": 0.013431549072265625, + "LowerBroadcast": 0.052695274353027344, + "LowerCCOpBlockAxis": 0.23342013359069824, + "LowerComplexBroadcast": 0.15745258331298828, + "LowerIntrinsics": 1.2232732772827148, + "LowerTensorOp": 0.5340566635131836, + "LowerTranspose": 0.3917994499206543, + "MacroGeneration": 2.18086314201355, + "MaskPropagation": 0.1404891014099121, + "MemcpyElimination": 4.92922306060791, + "MutateDataType": 0.0495760440826416, + "NeuronAliasDependencyInduction": 0.025828838348388672, + "NeuronAliasDependencyReset": 0.036108970642089844, + "NeuronInstComb": 0.21500587463378906, + "NeuronLICM": 0.29561591148376465, + "NeuronLoopFusion": 0.5112388134002686, + "NeuronLoopInterchange": 0.049500465393066406, + "NeuronSimplifier": 0.33487582206726074, + "NeuronSimplifyPredicates": 0.18243002891540527, + "NeuronValueNumbering": 0.11087703704833984, + "OptimizeAliasedCopyChain": 0.01750659942626953, + "OptimizeNKIKernels": 0.4911010265350342, + "PAGLayoutOpt": 29.41068458557129, + "PComputeCutting": 0.49399471282958984, + "PGLayoutTilingPipeline": 43.084712982177734, + "PGTiling": 4.971738815307617, + "PadElimination": 0.009459257125854492, + "ParAxesAnnotation": 28.47618865966797, + "PartialLoopFusion": 0.4028160572052002, + "PartialSimdFusion": 0.3455219268798828, + "PerfectLoopNest": 0.06593203544616699, + "RecognizeOpIdiom": 0.21458721160888672, + "Recompute": 0.007538557052612305, + "RelaxPredicates": 0.15262603759765625, + "Rematerialization": 0.17077207565307617, + "ReshapeWeights": 0.02144646644592285, + "ResolveAccessConflict": 0.3060123920440674, + "ResolveComplicatePredicates": 0.05498790740966797, + "RewriteReplicationMatmul": 0.04878973960876465, + "RewriteWeights": 0.06134176254272461, + "SFKVectorizer": 4.835400581359863, + "SimpleAllReduceTiling": 0.0698094367980957, + "Simplifier": 0.12532973289489746, + "SimplifyMacroPredicates": 0.18832993507385254, + "SimplifyNeuronTensor": 1.3618369102478027, + "SimplifySlice": 0.037230491638183594, + "SimplifyTensor": 0.20799946784973145, + "SpillPSum": 0.40715956687927246, + "SplitAPUnionSets": 0.3596620559692383, + "SplitAccGrp": 0.0434412956237793, + "StaticProfiler": 0.14051222801208496, + "StaticTransposeLocalTensor": 0.22574949264526367, + "SundaISel": 1.6489593982696533, + "TCTransform": 0.04065871238708496, + "TensorInitialization": 0.2030961513519287, + "TensorOpSimplifier": 0.46704745292663574, + "TensorOpTransform": 1.5862066745758057, + "TileCCOps": 0.22340822219848633, + "TilingProfiler": 0.5842618942260742, + "TransformConvOp": 0.07411432266235352, + "TritiumFusion": 1.2081208229064941, + "ValueNumbering": 0.10727286338806152, + "VectorizeDMA": 0.03964352607727051, + "VectorizeMatMult": 0.023961544036865234, + "WeightCoalescing": 0.05964922904968262, + "ZeroSizeTensorElimination": 0.0008418560028076172 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 267577, + "StaticProfiler::AifUb": 11.113597869873047, + "StaticProfiler::ArithmeticIntensityTensorizer": 12.117298126220703, + "StaticProfiler::AverageDmaLength": 6709.45703125, + "StaticProfiler::AverageFractalPeUtilization": 99.84185028076172, + "StaticProfiler::AveragePartitionUtilization": 99.48448944091797, + "StaticProfiler::AveragePeUtilization": 99.56957244873047, + "StaticProfiler::DDRTransferBytes": 7653265752, + "StaticProfiler::InternalTransferBytes": 667425812, + "StaticProfiler::LoadExpanded": 1033415, + "StaticProfiler::LocalizationEfficiency": 109.03128051757813, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 109.1170883178711, + "StaticProfiler::StoreExpanded": 3422, + "StaticProfiler::TotalDMAExpanded": 1036837, + "StaticProfiler::TotalDynamicInstancesCount": 279479, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 278490, + "StaticProfiler::TotalLNCComm": 0, + "StaticProfiler::TotalLNCCommTransfer": 0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0, + "TilingProfiler::DmaInstructionsAfterTiling": 0, + "TilingProfiler::GenericInstructionsAfterTiling": 79, + "TilingProfiler::MatMultInstructionsAfterTiling": 233424, + "TilingProfiler::NumPfTransposes": 327, + "TilingProfiler::NumPfTransposesForIo": 38, + "TilingProfiler::NumPfTransposesForLocal": 144, + "TilingProfiler::NumPfTransposesForNonlocal": 145, + "TilingProfiler::PfTransposeInstructions": 20702, + "TilingProfiler::PfTransposeInstructionsForIo": 20161, + "TilingProfiler::PfTransposeInstructionsForLocal": 396, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 145, + "TilingProfiler::ReduceInstructionsAfterTiling": 362, + "TilingProfiler::SimdInstructionsAfterTiling": 3755, + "TilingProfiler::TotalInstructionsAfterTiling": 0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0, + "TransformConvOp::conv2d_column_packing": 0, + "TransformConvOp::conv2d_column_packing_1": 0, + "TransformConvOp::conv2d_column_packing_io10": 0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0 + } + } +} \ No newline at end of file diff --git a/token_generation_model/_tp0_bk3/graph.neff b/token_generation_model/_tp0_bk3/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..5c280045ee159689e7f6140a94f0737a97c3900e --- /dev/null +++ b/token_generation_model/_tp0_bk3/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:487e4feb4f697b81662129763fb99bc19ea4fd42678a67a9e895c701ddb22265 +size 6124544 diff --git a/token_generation_model/_tp0_bk3/log-neuron-cc.txt b/token_generation_model/_tp0_bk3/log-neuron-cc.txt new file mode 100644 index 0000000000000000000000000000000000000000..4beb91abace28943c915a7f8517f8b77d7e6d11a --- /dev/null +++ b/token_generation_model/_tp0_bk3/log-neuron-cc.txt @@ -0,0 +1,2934 @@ +2025-08-07T13:53:51Z INFO 47986 [root]: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/bin/neuronx-cc compile --framework=XLA /home/ubuntu/qwen3/token_generation_model/_tp0_bk3/model.MODULE_d3ed4857bd8baeff8023+b05cff0a.hlo_module.pb --output /home/ubuntu/qwen3/token_generation_model/_tp0_bk3/model.MODULE_d3ed4857bd8baeff8023+b05cff0a.neff --target=trn1 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma' --lnc=1 -O2 --internal-hlo2tensorizer-options=--verify-hlo=true --logfile=/home/ubuntu/qwen3/token_generation_model/_tp0_bk3/log-neuron-cc.txt --verbose=35 +2025-08-07T13:53:51Z INFO 47986 [root]: NeuronX Compiler version 2.20.9961.0+0acef03a Python version 3.10.12 HWM version 2.20.0.9961+0acef03a NumPy version 1.26.4 Running on AMI ami-040348201d80b58ad Running in region usw2-az4 +2025-08-07T13:53:51Z INFO 48501 [root]: XLA detected +2025-08-07T13:53:51Z INFO 48501 [root]: Pipeline: HLOToTensorizer Frontend StaticIOTranspose WalrusDriver BIRLinker Kelper NeffWrapper +2025-08-07T13:53:51Z INFO 48501 [root]: Intermediate files stored in /home/ubuntu/qwen3/token_generation_model/_tp0_bk3/neuronxcc-m7dyulmn, output in /home/ubuntu/qwen3/token_generation_model/_tp0_bk3 +2025-08-07T13:53:51Z INFO 48501 [pipeline.Pipeline.0]: Job Pipeline len(in_states) 1 +2025-08-07T13:53:51Z INFO 48501 [pipeline.Pipeline.0]: Processing input #0 +2025-08-07T13:53:51Z INFO 48501 [pipeline.Pipeline.0]: Running pipeline Pipeline.0 +2025-08-07T13:53:51Z INFO 48501 [pipeline.Pipeline.0]: Starting job job.HLOToTensorizer.0 +2025-08-07T13:53:51Z INFO 48501 [job.HLOToTensorizer.0]: Job HLOToTensorizer len(in_states) 1 +2025-08-07T13:53:51Z INFO 48501 [job.HLOToTensorizer.0]: Processing input #0 +2025-08-07T13:53:51Z INFO 48501 [job.HLOToTensorizer.0]: IR signature: 632099279834ad49336b20fe638b015f3bd9f3d5379c77ef3a7fbaef8cea450e for model.MODULE_d3ed4857bd8baeff8023+b05cff0a.hlo_module.pb +2025-08-07T13:53:51Z INFO 48501 [job.HLOToTensorizer.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo2penguin --input /home/ubuntu/qwen3/token_generation_model/_tp0_bk3/model.MODULE_d3ed4857bd8baeff8023+b05cff0a.hlo_module.pb --out-dir ./ --output penguin.py --remat --max-costly-ops=2 --max-live-in-size=5 --max-remat-chain-size=10 --max-mem-multiple=1.8 --min-def-use-distance=500 --remat-policy=transformer --allow-same-pass-remat=true --layers-per-module=1 --emit-tensor-level-dropout-ops --verify-hlo=true --native-to-custom-softmax --partitioner-opts='--transformer' +2025-08-07T13:53:51Z INFO 48501 [job.HLOToTensorizer.0]: DEBUG: needsModular? No. macCnt 3935191104 num non-trivial Ops 3786 +INFO: Switching to single-module compile. PrePartitionPipe skipped. +INFO: Found memory bound graph +INFO: Number of Native SoftmaxDx's detected and replaced: 0 +INFO: Number of Native Softmax's detected and replaced: 2 +Replaced 0 dropout sequences with OffloadedDropout +INFO: HloMacCount has found 3935117312 +INFO: Traffic has found 8267158125 +INFO: AIF 0.951988 +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert cosine custom-call divide dot exponential gather get-tuple-element iota maximum multiply negate pad parameter reduce reshape rng scatter select sine slice subtract transpose tuple +Warning: Could not open file debug_info_hlo_partitions.json +2025-08-07 13:53:51.877324: W hilo/hlo2penguin/utils/DumpDebugInfo.cc:52] Truncating long HLO operator name %tuple.13231 = tuple(%reshape.5201, %scatter.12235, %scatter.12250, %scatter.12263, %scatter.12278, %scatter.12291, %scatter.12306, %scatter.12319, %scatter.12334, %scatter.12347, %scatter.12362, %scatter.12375, %scatter.12390, %scatter.12403, %scatter.12418, %scatter.12431, %scatter.12446, %scatter.12459, %scatter.12474, %scatter.12487, %scatter.12502, %scatter.12515, %scatter.12530, %scatter.12543, %scatter.12558, %scatter.12571, %scatter.12586, %scatter.12599, %scatter.12614, %scatter.12627, %scatter.... to 512 characters in the compiler's debug metadata +Invoking RemoveOptimizationBarriers pass + +2025-08-07T13:53:51Z INFO 48501 [job.HLOToTensorizer.0]: IR signature: c36f0f5265e2cd08163e83807d255bf10116090bce83b9eb075c7460705d535f for sg0000/HLOToTensorizer +2025-08-07T13:53:52Z INFO 48501 [job.HLOToTensorizer.0]: Job #0 finished +2025-08-07T13:53:52Z INFO 48501 [pipeline.Pipeline.0]: Finished job job.HLOToTensorizer.0 +2025-08-07T13:53:52Z INFO 48501 [pipeline.Pipeline.0]: Starting job job.Frontend.0 +2025-08-07T13:53:52Z INFO 48501 [job.Frontend.0]: Job Frontend len(in_states) 1 +2025-08-07T13:53:52Z INFO 48501 [job.Frontend.0]: Processing input #0 +2025-08-07T13:53:52Z INFO 48501 [job.Frontend.0]: Start model loading +2025-08-07T13:53:52Z INFO 48501 [job.Frontend.0]: Start tensorization +2025-08-07T13:53:52Z INFO 48501 [job.Frontend.0]: Num jobs: 1 +2025-08-07T13:53:52Z USER 48501 [root/Tensorizer/Tensorizer]: Running Tensorizer +2025-08-07T13:53:52Z INFO 48501 [Tensorizer]: Frontend did not find netlist info. Switching to flat flow. +2025-08-07T13:53:52Z INFO 48501 [Tensorizer]: Building model from Penguin script "penguin.py"... +2025-08-07T13:53:53Z INFO 48501 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=1 --num-neuroncores-per-sengine=1 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --keep-remat-dma-transpose --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-08-07T13:53:53Z INFO 48501 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:53:53Z INFO 48501 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 48501 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:53:53Z INFO 48501 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-08-07T13:53:53Z INFO 48501 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48501 [sg0000/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.040 seconds +2025-08-07T13:53:53Z INFO 48501 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-08-07T13:53:53Z INFO 48501 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48501 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.018 seconds +2025-08-07T13:53:53Z INFO 48501 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:53Z INFO 48501 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-08-07T13:53:53Z INFO 48501 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.134 seconds +2025-08-07T13:53:53Z INFO 48501 [sg0000/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-08-07T13:53:53Z INFO 48501 [sg0000/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-08-07T13:53:53Z INFO 48501 [sg0000/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.074 seconds +2025-08-07T13:53:53Z INFO 48501 [sg0000/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-08-07T13:53:54Z INFO 48501 [sg0000/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 48501 [sg0000/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.534 seconds +2025-08-07T13:53:54Z INFO 48501 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:53:54Z INFO 48501 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:54Z INFO 48501 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 48501 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.007 seconds +2025-08-07T13:53:54Z INFO 48501 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:53:54Z INFO 48501 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-08-07T13:53:54Z INFO 48501 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.379 seconds +2025-08-07T13:53:54Z INFO 48501 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.418 seconds +2025-08-07T13:53:54Z INFO 48501 [sg0000/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-08-07T13:53:55Z INFO 48501 [sg0000/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48501 [sg0000/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.467 seconds +2025-08-07T13:53:55Z INFO 48501 [sg0000/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-08-07T13:53:55Z INFO 48501 [sg0000/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48501 [sg0000/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.074 seconds +2025-08-07T13:53:55Z INFO 48501 [sg0000/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-08-07T13:53:55Z INFO 48501 [sg0000/Tensorizer/LegalizeCCOpLayout]: Finished (changed=True) +2025-08-07T13:53:55Z INFO 48501 [sg0000/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.086 seconds +2025-08-07T13:53:55Z INFO 48501 [sg0000/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-08-07T13:53:55Z INFO 48501 [sg0000/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48501 [sg0000/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.055 seconds +2025-08-07T13:53:55Z INFO 48501 [sg0000/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-08-07T13:53:55Z INFO 48501 [sg0000/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48501 [sg0000/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.056 seconds +2025-08-07T13:53:55Z INFO 48501 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:53:55Z INFO 48501 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48501 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.199 seconds +2025-08-07T13:53:55Z INFO 48501 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:53:55Z INFO 48501 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:53:55Z INFO 48501 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.059 seconds +2025-08-07T13:53:55Z INFO 48501 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:53:57Z INFO 48501 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:53:57Z INFO 48501 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 1.212 seconds +2025-08-07T13:53:57Z INFO 48501 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:57Z INFO 48501 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48501 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.058 seconds +2025-08-07T13:53:57Z INFO 48501 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:57Z INFO 48501 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48501 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.062 seconds +2025-08-07T13:53:57Z INFO 48501 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:53:57Z INFO 48501 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48501 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.062 seconds +2025-08-07T13:53:57Z INFO 48501 [sg0000/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-08-07T13:53:57Z INFO 48501 [sg0000/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48501 [sg0000/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.073 seconds +2025-08-07T13:53:57Z INFO 48501 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:53:57Z INFO 48501 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48501 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.064 seconds +2025-08-07T13:53:57Z INFO 48501 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-08-07T13:53:57Z INFO 48501 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48501 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.187 seconds +2025-08-07T13:53:57Z INFO 48501 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:53:57Z INFO 48501 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:53:57Z INFO 48501 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.059 seconds +2025-08-07T13:53:57Z INFO 48501 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-08-07T13:53:59Z INFO 48501 [sg0000/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 48501 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 1.586 seconds +2025-08-07T13:53:59Z INFO 48501 [sg0000/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-08-07T13:53:59Z INFO 48501 [sg0000/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 48501 [sg0000/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.375 seconds +2025-08-07T13:53:59Z INFO 48501 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-08-07T13:53:59Z INFO 48501 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:53:59Z INFO 48501 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-08-07T13:53:59Z INFO 48501 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.006 seconds +2025-08-07T13:53:59Z INFO 48501 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-08-07T13:54:00Z INFO 48501 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-08-07T13:54:00Z INFO 48501 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.450 seconds +2025-08-07T13:54:00Z INFO 48501 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.479 seconds +2025-08-07T13:54:00Z INFO 48501 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-08-07T13:54:05Z INFO 48501 [sg0000/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-08-07T13:54:05Z INFO 48501 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 4.929 seconds +2025-08-07T13:54:05Z INFO 48501 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:54:07Z INFO 48501 [sg0000/Tensorizer/LoopFusion]: Finished (changed=True) +2025-08-07T13:54:07Z INFO 48501 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 2.138 seconds +2025-08-07T13:54:07Z INFO 48501 [sg0000/Tensorizer/Rematerialization]: Running Rematerialization +2025-08-07T13:54:07Z INFO 48501 [sg0000/Tensorizer/Rematerialization]: Finished (changed=True) +2025-08-07T13:54:07Z INFO 48501 [sg0000/Tensorizer/Rematerialization]: Rematerialization finished after 0.171 seconds +2025-08-07T13:54:07Z INFO 48501 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:54:07Z INFO 48501 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:54:07Z INFO 48501 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.408 seconds +2025-08-07T13:54:07Z INFO 48501 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:54:08Z INFO 48501 [sg0000/Tensorizer/Delinearization]: Finished (changed=True) +2025-08-07T13:54:08Z INFO 48501 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.520 seconds +2025-08-07T13:54:08Z INFO 48501 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:54:09Z INFO 48501 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=True) +2025-08-07T13:54:09Z INFO 48501 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 1.540 seconds +2025-08-07T13:54:09Z INFO 48501 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:54:10Z INFO 48501 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:54:10Z INFO 48501 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.134 seconds +2025-08-07T13:54:10Z INFO 48501 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:54:10Z INFO 48501 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:54:10Z INFO 48501 [sg0000/Tensorizer/LICM]: LICM finished after 0.087 seconds +2025-08-07T13:54:10Z INFO 48501 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:54:10Z INFO 48501 [sg0000/Tensorizer/Delinearization]: Finished (changed=True) +2025-08-07T13:54:10Z INFO 48501 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.156 seconds +2025-08-07T13:54:10Z INFO 48501 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:54:10Z INFO 48501 [sg0000/Tensorizer/LoopFusion]: Finished (changed=True) +2025-08-07T13:54:10Z INFO 48501 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.546 seconds +2025-08-07T13:54:10Z INFO 48501 [sg0000/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-08-07T13:54:10Z INFO 48501 [sg0000/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-08-07T13:54:10Z INFO 48501 [sg0000/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.037 seconds +2025-08-07T13:54:10Z INFO 48501 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:54:11Z INFO 48501 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:54:11Z INFO 48501 [sg0000/Tensorizer/LICM]: LICM finished after 0.065 seconds +2025-08-07T13:54:11Z INFO 48501 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:54:11Z INFO 48501 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-08-07T13:54:11Z INFO 48501 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.256 seconds +2025-08-07T13:54:11Z INFO 48501 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:54:11Z INFO 48501 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=True) +2025-08-07T13:54:11Z INFO 48501 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.137 seconds +2025-08-07T13:54:11Z INFO 48501 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:54:11Z INFO 48501 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-08-07T13:54:11Z INFO 48501 [sg0000/Tensorizer/LICM]: LICM finished after 0.063 seconds +2025-08-07T13:54:11Z INFO 48501 [sg0000/Tensorizer/PadElimination]: Running PadElimination +2025-08-07T13:54:11Z INFO 48501 [sg0000/Tensorizer/PadElimination]: Finished (changed=False) +2025-08-07T13:54:11Z INFO 48501 [sg0000/Tensorizer/PadElimination]: PadElimination finished after 0.009 seconds +2025-08-07T13:54:11Z INFO 48501 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:54:11Z INFO 48501 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:54:11Z INFO 48501 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.154 seconds +2025-08-07T13:54:11Z INFO 48501 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-08-07T13:54:12Z INFO 48501 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-08-07T13:54:12Z INFO 48501 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.338 seconds +2025-08-07T13:54:12Z INFO 48501 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:54:12Z INFO 48501 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:54:12Z INFO 48501 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.036 seconds +2025-08-07T13:54:12Z INFO 48501 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:54:12Z INFO 48501 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:54:12Z INFO 48501 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.125 seconds +2025-08-07T13:54:12Z INFO 48501 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:54:12Z INFO 48501 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:54:12Z INFO 48501 [sg0000/Tensorizer/LICM]: LICM finished after 0.065 seconds +2025-08-07T13:54:12Z INFO 48501 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-08-07T13:54:12Z INFO 48501 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-08-07T13:54:12Z INFO 48501 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.107 seconds +2025-08-07T13:54:12Z INFO 48501 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-08-07T13:54:12Z INFO 48501 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-08-07T13:54:12Z INFO 48501 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.041 seconds +2025-08-07T13:54:12Z INFO 48501 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-08-07T13:54:12Z INFO 48501 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-08-07T13:54:12Z INFO 48501 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.042 seconds +2025-08-07T13:54:12Z INFO 48501 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-08-07T13:54:12Z INFO 48501 [sg0000/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-08-07T13:54:12Z INFO 48501 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.215 seconds +2025-08-07T13:54:12Z INFO 48501 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:54:12Z INFO 48501 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-08-07T13:54:12Z INFO 48501 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.190 seconds +2025-08-07T13:54:12Z INFO 48501 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-08-07T13:54:13Z INFO 48501 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-08-07T13:54:13Z INFO 48501 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.414 seconds +2025-08-07T13:54:13Z INFO 48501 [sg0000/Tensorizer/Recompute]: Running Recompute +2025-08-07T13:54:13Z INFO 48501 [sg0000/Tensorizer/Recompute]: Finished (changed=False) +2025-08-07T13:54:13Z INFO 48501 [sg0000/Tensorizer/Recompute]: Recompute finished after 0.008 seconds +2025-08-07T13:54:13Z INFO 48501 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:54:13Z INFO 48501 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:54:13Z INFO 48501 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.040 seconds +2025-08-07T13:54:13Z INFO 48501 [Tensorizer]: After optimization: 1185 statements +2025-08-07T13:54:13Z INFO 48501 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:54:13Z INFO 48501 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:54:13Z INFO 48501 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:54:13Z INFO 48501 [sg0000/Tensorizer/MutateDataType]: Running MutateDataType +2025-08-07T13:54:13Z INFO 48501 [sg0000/Tensorizer/MutateDataType]: Finished (changed=False) +2025-08-07T13:54:13Z INFO 48501 [sg0000/Tensorizer/MutateDataType]: MutateDataType finished after 0.050 seconds +2025-08-07T13:54:13Z INFO 48501 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-08-07T13:54:13Z INFO 48501 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-08-07T13:54:13Z INFO 48501 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.036 seconds +2025-08-07T13:54:13Z INFO 48501 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-08-07T13:54:13Z INFO 48501 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-08-07T13:54:13Z INFO 48501 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.125 seconds +2025-08-07T13:54:13Z INFO 48501 [sg0000/Tensorizer/TileCCOps]: Running TileCCOps +2025-08-07T13:54:13Z INFO 48501 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=8192 is not above min_allgather_tile_size_in_bytes=8388608` +2025-08-07T13:54:13Z INFO 48501 [sg0000/Tensorizer/TileCCOps]: in bfloat16 (4096,) %'all_gather.1' = AllGatherOp-502 AllGather_add(bfloat16 (2048,) %'gather.1', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((4096,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.47 | hlo_id: 47 | , id = 502 +2025-08-07T13:54:13Z INFO 48501 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `All gather output tensor check failed` +2025-08-07T13:54:13Z INFO 48501 [sg0000/Tensorizer/TileCCOps]: in float32 (512,) %'all_gather.2' = AllGatherOp-10757 AllGather_add(float32 (256,) %'add.217', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.12066 | hlo_id: 12066 | , id = 10757 +2025-08-07T13:54:13Z INFO 48501 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=2048 is not above min_allgather_tile_size_in_bytes=8388608` +2025-08-07T13:54:13Z INFO 48501 [sg0000/Tensorizer/TileCCOps]: in uint32 (512,) %'all_gather.3' = AllGatherOp-10773 AllGather_add(uint32 (256,) %'add.218', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.12201 | hlo_id: 12201 | , id = 10773 +2025-08-07T13:54:13Z INFO 48501 [sg0000/Tensorizer/TileCCOps]: Finished (changed=False) +2025-08-07T13:54:13Z INFO 48501 [sg0000/Tensorizer/TileCCOps]: TileCCOps finished after 0.223 seconds +2025-08-07T13:54:13Z INFO 48501 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:54:14Z INFO 48501 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-08-07T13:54:14Z INFO 48501 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.480 seconds +2025-08-07T13:54:14Z INFO 48501 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:54:14Z INFO 48501 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:54:14Z INFO 48501 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.139 seconds +2025-08-07T13:54:14Z INFO 48501 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:54:14Z INFO 48501 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:54:14Z INFO 48501 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.326 seconds +2025-08-07T13:54:14Z INFO 48501 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-08-07T13:54:14Z INFO 48501 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-08-07T13:54:14Z INFO 48501 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.038 seconds +2025-08-07T13:54:14Z INFO 48501 [sg0000/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-08-07T13:54:14Z INFO 48501 [sg0000/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-08-07T13:54:14Z INFO 48501 [sg0000/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.045 seconds +2025-08-07T13:54:14Z INFO 48501 [sg0000/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-08-07T13:54:15Z INFO 48501 [sg0000/Tensorizer/InferIntrinsicOnCC]: Finished (changed=True) +2025-08-07T13:54:15Z INFO 48501 [sg0000/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.366 seconds +2025-08-07T13:54:15Z INFO 48501 [sg0000/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-08-07T13:54:15Z INFO 48501 [sg0000/Tensorizer/ResolveAccessConflict]: Finished (changed=True) +2025-08-07T13:54:15Z INFO 48501 [sg0000/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.306 seconds +2025-08-07T13:54:15Z INFO 48501 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:54:15Z INFO 48501 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:54:15Z INFO 48501 [sg0000/Tensorizer/LICM]: LICM finished after 0.072 seconds +2025-08-07T13:54:15Z INFO 48501 [sg0000/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-08-07T13:54:16Z INFO 48501 [sg0000/Tensorizer/LocalLayoutOpt]: Finished (changed=False) +2025-08-07T13:54:16Z INFO 48501 [sg0000/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.389 seconds +2025-08-07T13:54:16Z INFO 48501 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-08-07T13:54:16Z INFO 48501 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-08-07T13:54:16Z INFO 48501 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.338 seconds +2025-08-07T13:54:16Z INFO 48501 [sg0000/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-08-07T13:54:16Z INFO 48501 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:54:16Z INFO 48501 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-08-07T13:54:16Z INFO 48501 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.201 seconds +2025-08-07T13:54:16Z INFO 48501 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-08-07T13:54:16Z INFO 48501 [sg0000/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-08-07T13:54:16Z INFO 48501 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-08-07T13:54:17Z INFO 48501 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-08-07T13:54:17Z INFO 48501 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.201 seconds +2025-08-07T13:54:17Z INFO 48501 [sg0000/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-08-07T13:54:17Z INFO 48501 [sg0000/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 1.008 seconds +2025-08-07T13:54:17Z INFO 48501 [sg0000/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-08-07T13:54:17Z INFO 48501 [sg0000/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.321 seconds +2025-08-07T13:54:18Z INFO 48501 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 1.342 seconds +2025-08-07T13:54:18Z INFO 48501 [sg0000/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-08-07T13:54:18Z INFO 48501 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:54:19Z INFO 48501 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-08-07T13:54:22Z INFO 48501 [sg0000/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-08-07T13:54:22Z INFO 48501 [sg0000/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 4.329 seconds +2025-08-07T13:54:22Z INFO 48501 [sg0000/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-08-07T13:54:22Z INFO 48501 [sg0000/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-08-07T13:54:22Z INFO 48501 [sg0000/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-08-07T13:54:50Z INFO 48501 [sg0000/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-08-07T13:54:50Z INFO 48501 [sg0000/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 28.476 seconds +2025-08-07T13:54:50Z INFO 48501 [sg0000/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-08-07T13:54:51Z INFO 48501 [sg0000/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-08-07T13:54:51Z INFO 48501 [sg0000/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.924 seconds +2025-08-07T13:54:51Z INFO 48501 [sg0000/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 29.411 seconds +2025-08-07T13:54:51Z INFO 48501 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-08-07T13:54:51Z INFO 48501 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-08-07T13:54:51Z INFO 48501 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.140 seconds +2025-08-07T13:54:51Z INFO 48501 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-08-07T13:54:52Z INFO 48501 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-08-07T13:54:52Z INFO 48501 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.219 seconds +2025-08-07T13:54:52Z INFO 48501 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-08-07T13:54:52Z INFO 48501 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-08-07T13:54:52Z INFO 48501 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.233 seconds +2025-08-07T13:54:52Z INFO 48501 [sg0000/Tensorizer/PGTiling]: Running PGTiling +2025-08-07T13:54:52Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11014 of IO tensor {'CrossPassTensor': ''}bfloat16 %input4|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(1, 'AG2803'), (260, 'AG2797'), (152, 'AG2801')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11291 of IO tensor {'CrossPassTensor': ''}bfloat16 %input6|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(3, 'AG2817'), (260, 'AG2797'), (155, 'AG2815')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11542 of IO tensor {'CrossPassTensor': ''}bfloat16 %input8|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(5, 'AG2829'), (260, 'AG2797'), (158, 'AG2827')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11793 of IO tensor {'CrossPassTensor': ''}bfloat16 %input10|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(7, 'AG2841'), (260, 'AG2797'), (161, 'AG2839')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12044 of IO tensor {'CrossPassTensor': ''}bfloat16 %input12|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(9, 'AG2853'), (260, 'AG2797'), (164, 'AG2851')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12295 of IO tensor {'CrossPassTensor': ''}bfloat16 %input14|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(11, 'AG2865'), (260, 'AG2797'), (167, 'AG2863')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12546 of IO tensor {'CrossPassTensor': ''}bfloat16 %input16|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(13, 'AG2877'), (260, 'AG2797'), (170, 'AG2875')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12797 of IO tensor {'CrossPassTensor': ''}bfloat16 %input18|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(15, 'AG2889'), (260, 'AG2797'), (173, 'AG2887')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13048 of IO tensor {'CrossPassTensor': ''}bfloat16 %input20|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(17, 'AG2901'), (260, 'AG2797'), (176, 'AG2899')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13299 of IO tensor {'CrossPassTensor': ''}bfloat16 %input22|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(19, 'AG2913'), (260, 'AG2797'), (179, 'AG2911')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13550 of IO tensor {'CrossPassTensor': ''}bfloat16 %input24|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(21, 'AG2925'), (260, 'AG2797'), (182, 'AG2923')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13801 of IO tensor {'CrossPassTensor': ''}bfloat16 %input26|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(23, 'AG2937'), (260, 'AG2797'), (185, 'AG2935')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14052 of IO tensor {'CrossPassTensor': ''}bfloat16 %input28|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(25, 'AG2949'), (260, 'AG2797'), (188, 'AG2947')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14303 of IO tensor {'CrossPassTensor': ''}bfloat16 %input30|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(27, 'AG2961'), (260, 'AG2797'), (191, 'AG2959')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14554 of IO tensor {'CrossPassTensor': ''}bfloat16 %input32|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(29, 'AG2973'), (260, 'AG2797'), (194, 'AG2971')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14805 of IO tensor {'CrossPassTensor': ''}bfloat16 %input34|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(31, 'AG2985'), (260, 'AG2797'), (197, 'AG2983')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15056 of IO tensor {'CrossPassTensor': ''}bfloat16 %input36|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(33, 'AG2997'), (260, 'AG2797'), (200, 'AG2995')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15307 of IO tensor {'CrossPassTensor': ''}bfloat16 %input38|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(35, 'AG3009'), (260, 'AG2797'), (203, 'AG3007')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15558 of IO tensor {'CrossPassTensor': ''}bfloat16 %input40|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(37, 'AG3021'), (260, 'AG2797'), (206, 'AG3019')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15809 of IO tensor {'CrossPassTensor': ''}bfloat16 %input42|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(39, 'AG3033'), (260, 'AG2797'), (209, 'AG3031')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16060 of IO tensor {'CrossPassTensor': ''}bfloat16 %input44|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(41, 'AG3045'), (260, 'AG2797'), (212, 'AG3043')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16311 of IO tensor {'CrossPassTensor': ''}bfloat16 %input46|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(43, 'AG3057'), (260, 'AG2797'), (215, 'AG3055')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16562 of IO tensor {'CrossPassTensor': ''}bfloat16 %input48|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(45, 'AG3069'), (260, 'AG2797'), (218, 'AG3067')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16813 of IO tensor {'CrossPassTensor': ''}bfloat16 %input50|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(47, 'AG3081'), (260, 'AG2797'), (221, 'AG3079')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 17064 of IO tensor {'CrossPassTensor': ''}bfloat16 %input52|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(49, 'AG3093'), (260, 'AG2797'), (224, 'AG3091')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 17315 of IO tensor {'CrossPassTensor': ''}bfloat16 %input54|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(51, 'AG3105'), (260, 'AG2797'), (227, 'AG3103')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 17566 of IO tensor {'CrossPassTensor': ''}bfloat16 %input56|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(53, 'AG3117'), (260, 'AG2797'), (230, 'AG3115')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 17817 of IO tensor {'CrossPassTensor': ''}bfloat16 %input58|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(55, 'AG3129'), (260, 'AG2797'), (233, 'AG3127')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 18068 of IO tensor {'CrossPassTensor': ''}bfloat16 %input60|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(57, 'AG3141'), (260, 'AG2797'), (236, 'AG3139')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 18319 of IO tensor {'CrossPassTensor': ''}bfloat16 %input62|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(59, 'AG3153'), (260, 'AG2797'), (239, 'AG3151')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 18570 of IO tensor {'CrossPassTensor': ''}bfloat16 %input64|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(61, 'AG3165'), (260, 'AG2797'), (242, 'AG3163')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 18821 of IO tensor {'CrossPassTensor': ''}bfloat16 %input66|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(63, 'AG3177'), (260, 'AG2797'), (245, 'AG3175')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 19072 of IO tensor {'CrossPassTensor': ''}bfloat16 %input68|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(65, 'AG3189'), (260, 'AG2797'), (248, 'AG3187')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 19323 of IO tensor {'CrossPassTensor': ''}bfloat16 %input70|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(67, 'AG3201'), (260, 'AG2797'), (251, 'AG3199')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 19574 of IO tensor {'CrossPassTensor': ''}bfloat16 %input72|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(69, 'AG3213'), (260, 'AG2797'), (254, 'AG3211')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 19825 of IO tensor {'CrossPassTensor': ''}bfloat16 %input74|NHWC|(4, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(71, 'AG3225'), (260, 'AG2797'), (257, 'AG3223')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11165 of IO tensor {'CrossPassTensor': ''}bfloat16 %input77(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(153, 'AG2807'), (1, 'AG2803'), (80, 'AG2802'), (264, 'AG2806'), (409, 'AG2805')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28445 of IO tensor {'CrossPassTensor': ''}bfloat16 %input78|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(408, 'AG2798'), (261, 'AG2799')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28436 of IO tensor {'CrossPassTensor': ''}bfloat16 %input79|NC|(128, 32) is not sorted, index list (w/ AG ids): [(408, 'AG2798'), (261, 'AG2799')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28442 of IO tensor {'CrossPassTensor': ''}bfloat16 %input81(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(408, 'AG2798'), (261, 'AG2799')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28437 of IO tensor {'CrossPassTensor': ''}bfloat16 %input83(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(408, 'AG2798'), (261, 'AG2799')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28450 of IO tensor {'CrossPassTensor': ''}bfloat16 %input84(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(81, 'AG2812'), (266, 'AG2810'), (154, 'AG2811')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28449 of IO tensor {'CrossPassTensor': ''}bfloat16 %input85|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28447 of IO tensor {'CrossPassTensor': ''}bfloat16 %input86|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28448 of IO tensor {'CrossPassTensor': ''}bfloat16 %input87|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11438 of IO tensor {'CrossPassTensor': ''}bfloat16 %input88(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(156, 'AG2821'), (3, 'AG2817'), (82, 'AG2816'), (269, 'AG2820'), (411, 'AG2819')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28458 of IO tensor {'CrossPassTensor': ''}bfloat16 %input89|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28451 of IO tensor {'CrossPassTensor': ''}bfloat16 %input90|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28455 of IO tensor {'CrossPassTensor': ''}bfloat16 %input92(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28452 of IO tensor {'CrossPassTensor': ''}bfloat16 %input94(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28463 of IO tensor {'CrossPassTensor': ''}bfloat16 %input95(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(83, 'AG2824'), (270, 'AG2822'), (157, 'AG2823')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28462 of IO tensor {'CrossPassTensor': ''}bfloat16 %input96|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28460 of IO tensor {'CrossPassTensor': ''}bfloat16 %input97|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28461 of IO tensor {'CrossPassTensor': ''}bfloat16 %input98|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11689 of IO tensor {'CrossPassTensor': ''}bfloat16 %input99(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(159, 'AG2833'), (5, 'AG2829'), (84, 'AG2828'), (273, 'AG2832'), (412, 'AG2831')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28471 of IO tensor {'CrossPassTensor': ''}bfloat16 %input100|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28464 of IO tensor {'CrossPassTensor': ''}bfloat16 %input101|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28468 of IO tensor {'CrossPassTensor': ''}bfloat16 %input103(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28465 of IO tensor {'CrossPassTensor': ''}bfloat16 %input105(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28476 of IO tensor {'CrossPassTensor': ''}bfloat16 %input106(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(85, 'AG2836'), (274, 'AG2834'), (160, 'AG2835')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28475 of IO tensor {'CrossPassTensor': ''}bfloat16 %input107|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28473 of IO tensor {'CrossPassTensor': ''}bfloat16 %input108|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28474 of IO tensor {'CrossPassTensor': ''}bfloat16 %input109|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11940 of IO tensor {'CrossPassTensor': ''}bfloat16 %input110(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(162, 'AG2845'), (7, 'AG2841'), (86, 'AG2840'), (277, 'AG2844'), (413, 'AG2843')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28484 of IO tensor {'CrossPassTensor': ''}bfloat16 %input111|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28477 of IO tensor {'CrossPassTensor': ''}bfloat16 %input112|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28481 of IO tensor {'CrossPassTensor': ''}bfloat16 %input114(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28478 of IO tensor {'CrossPassTensor': ''}bfloat16 %input116(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28489 of IO tensor {'CrossPassTensor': ''}bfloat16 %input117(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(87, 'AG2848'), (278, 'AG2846'), (163, 'AG2847')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28488 of IO tensor {'CrossPassTensor': ''}bfloat16 %input118|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28486 of IO tensor {'CrossPassTensor': ''}bfloat16 %input119|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28487 of IO tensor {'CrossPassTensor': ''}bfloat16 %input120|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12191 of IO tensor {'CrossPassTensor': ''}bfloat16 %input121(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(165, 'AG2857'), (9, 'AG2853'), (88, 'AG2852'), (281, 'AG2856'), (414, 'AG2855')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28497 of IO tensor {'CrossPassTensor': ''}bfloat16 %input122|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28490 of IO tensor {'CrossPassTensor': ''}bfloat16 %input123|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28494 of IO tensor {'CrossPassTensor': ''}bfloat16 %input125(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28491 of IO tensor {'CrossPassTensor': ''}bfloat16 %input127(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28502 of IO tensor {'CrossPassTensor': ''}bfloat16 %input128(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(89, 'AG2860'), (282, 'AG2858'), (166, 'AG2859')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28501 of IO tensor {'CrossPassTensor': ''}bfloat16 %input129|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28499 of IO tensor {'CrossPassTensor': ''}bfloat16 %input130|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28500 of IO tensor {'CrossPassTensor': ''}bfloat16 %input131|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12442 of IO tensor {'CrossPassTensor': ''}bfloat16 %input132(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(168, 'AG2869'), (11, 'AG2865'), (90, 'AG2864'), (285, 'AG2868'), (415, 'AG2867')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28510 of IO tensor {'CrossPassTensor': ''}bfloat16 %input133|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28503 of IO tensor {'CrossPassTensor': ''}bfloat16 %input134|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28507 of IO tensor {'CrossPassTensor': ''}bfloat16 %input136(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28504 of IO tensor {'CrossPassTensor': ''}bfloat16 %input138(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28515 of IO tensor {'CrossPassTensor': ''}bfloat16 %input139(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(91, 'AG2872'), (286, 'AG2870'), (169, 'AG2871')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28514 of IO tensor {'CrossPassTensor': ''}bfloat16 %input140|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28512 of IO tensor {'CrossPassTensor': ''}bfloat16 %input141|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28513 of IO tensor {'CrossPassTensor': ''}bfloat16 %input142|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12693 of IO tensor {'CrossPassTensor': ''}bfloat16 %input143(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(171, 'AG2881'), (13, 'AG2877'), (92, 'AG2876'), (289, 'AG2880'), (416, 'AG2879')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28523 of IO tensor {'CrossPassTensor': ''}bfloat16 %input144|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28516 of IO tensor {'CrossPassTensor': ''}bfloat16 %input145|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28520 of IO tensor {'CrossPassTensor': ''}bfloat16 %input147(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28517 of IO tensor {'CrossPassTensor': ''}bfloat16 %input149(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28528 of IO tensor {'CrossPassTensor': ''}bfloat16 %input150(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(93, 'AG2884'), (290, 'AG2882'), (172, 'AG2883')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28527 of IO tensor {'CrossPassTensor': ''}bfloat16 %input151|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28525 of IO tensor {'CrossPassTensor': ''}bfloat16 %input152|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28526 of IO tensor {'CrossPassTensor': ''}bfloat16 %input153|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12944 of IO tensor {'CrossPassTensor': ''}bfloat16 %input154(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(174, 'AG2893'), (15, 'AG2889'), (94, 'AG2888'), (293, 'AG2892'), (417, 'AG2891')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28536 of IO tensor {'CrossPassTensor': ''}bfloat16 %input155|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28529 of IO tensor {'CrossPassTensor': ''}bfloat16 %input156|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28533 of IO tensor {'CrossPassTensor': ''}bfloat16 %input158(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28530 of IO tensor {'CrossPassTensor': ''}bfloat16 %input160(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28541 of IO tensor {'CrossPassTensor': ''}bfloat16 %input161(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(95, 'AG2896'), (294, 'AG2894'), (175, 'AG2895')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28540 of IO tensor {'CrossPassTensor': ''}bfloat16 %input162|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28538 of IO tensor {'CrossPassTensor': ''}bfloat16 %input163|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28539 of IO tensor {'CrossPassTensor': ''}bfloat16 %input164|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13195 of IO tensor {'CrossPassTensor': ''}bfloat16 %input165(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(177, 'AG2905'), (17, 'AG2901'), (96, 'AG2900'), (297, 'AG2904'), (418, 'AG2903')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28549 of IO tensor {'CrossPassTensor': ''}bfloat16 %input166|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28542 of IO tensor {'CrossPassTensor': ''}bfloat16 %input167|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28546 of IO tensor {'CrossPassTensor': ''}bfloat16 %input169(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28543 of IO tensor {'CrossPassTensor': ''}bfloat16 %input171(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28554 of IO tensor {'CrossPassTensor': ''}bfloat16 %input172(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(97, 'AG2908'), (298, 'AG2906'), (178, 'AG2907')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28553 of IO tensor {'CrossPassTensor': ''}bfloat16 %input173|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28551 of IO tensor {'CrossPassTensor': ''}bfloat16 %input174|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28552 of IO tensor {'CrossPassTensor': ''}bfloat16 %input175|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13446 of IO tensor {'CrossPassTensor': ''}bfloat16 %input176(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(180, 'AG2917'), (19, 'AG2913'), (98, 'AG2912'), (301, 'AG2916'), (419, 'AG2915')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28562 of IO tensor {'CrossPassTensor': ''}bfloat16 %input177|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28555 of IO tensor {'CrossPassTensor': ''}bfloat16 %input178|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28559 of IO tensor {'CrossPassTensor': ''}bfloat16 %input180(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28556 of IO tensor {'CrossPassTensor': ''}bfloat16 %input182(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28567 of IO tensor {'CrossPassTensor': ''}bfloat16 %input183(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(99, 'AG2920'), (302, 'AG2918'), (181, 'AG2919')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28566 of IO tensor {'CrossPassTensor': ''}bfloat16 %input184|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28564 of IO tensor {'CrossPassTensor': ''}bfloat16 %input185|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28565 of IO tensor {'CrossPassTensor': ''}bfloat16 %input186|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13697 of IO tensor {'CrossPassTensor': ''}bfloat16 %input187(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(183, 'AG2929'), (21, 'AG2925'), (100, 'AG2924'), (305, 'AG2928'), (420, 'AG2927')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28575 of IO tensor {'CrossPassTensor': ''}bfloat16 %input188|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28568 of IO tensor {'CrossPassTensor': ''}bfloat16 %input189|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28572 of IO tensor {'CrossPassTensor': ''}bfloat16 %input191(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28569 of IO tensor {'CrossPassTensor': ''}bfloat16 %input193(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28580 of IO tensor {'CrossPassTensor': ''}bfloat16 %input194(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(101, 'AG2932'), (306, 'AG2930'), (184, 'AG2931')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28579 of IO tensor {'CrossPassTensor': ''}bfloat16 %input195|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28577 of IO tensor {'CrossPassTensor': ''}bfloat16 %input196|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28578 of IO tensor {'CrossPassTensor': ''}bfloat16 %input197|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13948 of IO tensor {'CrossPassTensor': ''}bfloat16 %input198(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(186, 'AG2941'), (23, 'AG2937'), (102, 'AG2936'), (309, 'AG2940'), (421, 'AG2939')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28588 of IO tensor {'CrossPassTensor': ''}bfloat16 %input199|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28581 of IO tensor {'CrossPassTensor': ''}bfloat16 %input200|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28585 of IO tensor {'CrossPassTensor': ''}bfloat16 %input202(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28582 of IO tensor {'CrossPassTensor': ''}bfloat16 %input204(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28593 of IO tensor {'CrossPassTensor': ''}bfloat16 %input205(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(103, 'AG2944'), (310, 'AG2942'), (187, 'AG2943')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28592 of IO tensor {'CrossPassTensor': ''}bfloat16 %input206|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28590 of IO tensor {'CrossPassTensor': ''}bfloat16 %input207|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28591 of IO tensor {'CrossPassTensor': ''}bfloat16 %input208|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14199 of IO tensor {'CrossPassTensor': ''}bfloat16 %input209(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(189, 'AG2953'), (25, 'AG2949'), (104, 'AG2948'), (313, 'AG2952'), (422, 'AG2951')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28601 of IO tensor {'CrossPassTensor': ''}bfloat16 %input210|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28594 of IO tensor {'CrossPassTensor': ''}bfloat16 %input211|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28598 of IO tensor {'CrossPassTensor': ''}bfloat16 %input213(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28595 of IO tensor {'CrossPassTensor': ''}bfloat16 %input215(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28606 of IO tensor {'CrossPassTensor': ''}bfloat16 %input216(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(105, 'AG2956'), (314, 'AG2954'), (190, 'AG2955')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28605 of IO tensor {'CrossPassTensor': ''}bfloat16 %input217|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28603 of IO tensor {'CrossPassTensor': ''}bfloat16 %input218|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28604 of IO tensor {'CrossPassTensor': ''}bfloat16 %input219|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14450 of IO tensor {'CrossPassTensor': ''}bfloat16 %input220(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(192, 'AG2965'), (27, 'AG2961'), (106, 'AG2960'), (317, 'AG2964'), (423, 'AG2963')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28614 of IO tensor {'CrossPassTensor': ''}bfloat16 %input221|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28607 of IO tensor {'CrossPassTensor': ''}bfloat16 %input222|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28611 of IO tensor {'CrossPassTensor': ''}bfloat16 %input224(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28608 of IO tensor {'CrossPassTensor': ''}bfloat16 %input226(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28619 of IO tensor {'CrossPassTensor': ''}bfloat16 %input227(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(107, 'AG2968'), (318, 'AG2966'), (193, 'AG2967')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28618 of IO tensor {'CrossPassTensor': ''}bfloat16 %input228|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28616 of IO tensor {'CrossPassTensor': ''}bfloat16 %input229|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28617 of IO tensor {'CrossPassTensor': ''}bfloat16 %input230|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14701 of IO tensor {'CrossPassTensor': ''}bfloat16 %input231(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(195, 'AG2977'), (29, 'AG2973'), (108, 'AG2972'), (321, 'AG2976'), (424, 'AG2975')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28627 of IO tensor {'CrossPassTensor': ''}bfloat16 %input232|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28620 of IO tensor {'CrossPassTensor': ''}bfloat16 %input233|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28624 of IO tensor {'CrossPassTensor': ''}bfloat16 %input235(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28621 of IO tensor {'CrossPassTensor': ''}bfloat16 %input237(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28632 of IO tensor {'CrossPassTensor': ''}bfloat16 %input238(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(109, 'AG2980'), (322, 'AG2978'), (196, 'AG2979')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28631 of IO tensor {'CrossPassTensor': ''}bfloat16 %input239|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28629 of IO tensor {'CrossPassTensor': ''}bfloat16 %input240|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28630 of IO tensor {'CrossPassTensor': ''}bfloat16 %input241|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14952 of IO tensor {'CrossPassTensor': ''}bfloat16 %input242(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(198, 'AG2989'), (31, 'AG2985'), (110, 'AG2984'), (325, 'AG2988'), (425, 'AG2987')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28640 of IO tensor {'CrossPassTensor': ''}bfloat16 %input243|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28633 of IO tensor {'CrossPassTensor': ''}bfloat16 %input244|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28637 of IO tensor {'CrossPassTensor': ''}bfloat16 %input246(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28634 of IO tensor {'CrossPassTensor': ''}bfloat16 %input248(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28645 of IO tensor {'CrossPassTensor': ''}bfloat16 %input249(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(111, 'AG2992'), (326, 'AG2990'), (199, 'AG2991')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28644 of IO tensor {'CrossPassTensor': ''}bfloat16 %input250|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28642 of IO tensor {'CrossPassTensor': ''}bfloat16 %input251|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28643 of IO tensor {'CrossPassTensor': ''}bfloat16 %input252|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15203 of IO tensor {'CrossPassTensor': ''}bfloat16 %input253(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(201, 'AG3001'), (33, 'AG2997'), (112, 'AG2996'), (329, 'AG3000'), (426, 'AG2999')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28653 of IO tensor {'CrossPassTensor': ''}bfloat16 %input254|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28646 of IO tensor {'CrossPassTensor': ''}bfloat16 %input255|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28650 of IO tensor {'CrossPassTensor': ''}bfloat16 %input257(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28647 of IO tensor {'CrossPassTensor': ''}bfloat16 %input259(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28658 of IO tensor {'CrossPassTensor': ''}bfloat16 %input260(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(113, 'AG3004'), (330, 'AG3002'), (202, 'AG3003')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28657 of IO tensor {'CrossPassTensor': ''}bfloat16 %input261|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28655 of IO tensor {'CrossPassTensor': ''}bfloat16 %input262|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28656 of IO tensor {'CrossPassTensor': ''}bfloat16 %input263|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15454 of IO tensor {'CrossPassTensor': ''}bfloat16 %input264(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(204, 'AG3013'), (35, 'AG3009'), (114, 'AG3008'), (333, 'AG3012'), (427, 'AG3011')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28666 of IO tensor {'CrossPassTensor': ''}bfloat16 %input265|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28659 of IO tensor {'CrossPassTensor': ''}bfloat16 %input266|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28663 of IO tensor {'CrossPassTensor': ''}bfloat16 %input268(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28660 of IO tensor {'CrossPassTensor': ''}bfloat16 %input270(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28671 of IO tensor {'CrossPassTensor': ''}bfloat16 %input271(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(115, 'AG3016'), (334, 'AG3014'), (205, 'AG3015')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28670 of IO tensor {'CrossPassTensor': ''}bfloat16 %input272|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28668 of IO tensor {'CrossPassTensor': ''}bfloat16 %input273|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28669 of IO tensor {'CrossPassTensor': ''}bfloat16 %input274|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15705 of IO tensor {'CrossPassTensor': ''}bfloat16 %input275(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(207, 'AG3025'), (37, 'AG3021'), (116, 'AG3020'), (337, 'AG3024'), (428, 'AG3023')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28679 of IO tensor {'CrossPassTensor': ''}bfloat16 %input276|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28672 of IO tensor {'CrossPassTensor': ''}bfloat16 %input277|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28676 of IO tensor {'CrossPassTensor': ''}bfloat16 %input279(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28673 of IO tensor {'CrossPassTensor': ''}bfloat16 %input281(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28684 of IO tensor {'CrossPassTensor': ''}bfloat16 %input282(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(117, 'AG3028'), (338, 'AG3026'), (208, 'AG3027')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28683 of IO tensor {'CrossPassTensor': ''}bfloat16 %input283|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28681 of IO tensor {'CrossPassTensor': ''}bfloat16 %input284|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28682 of IO tensor {'CrossPassTensor': ''}bfloat16 %input285|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15956 of IO tensor {'CrossPassTensor': ''}bfloat16 %input286(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(210, 'AG3037'), (39, 'AG3033'), (118, 'AG3032'), (341, 'AG3036'), (429, 'AG3035')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28692 of IO tensor {'CrossPassTensor': ''}bfloat16 %input287|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28685 of IO tensor {'CrossPassTensor': ''}bfloat16 %input288|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28689 of IO tensor {'CrossPassTensor': ''}bfloat16 %input290(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28686 of IO tensor {'CrossPassTensor': ''}bfloat16 %input292(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28697 of IO tensor {'CrossPassTensor': ''}bfloat16 %input293(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(119, 'AG3040'), (342, 'AG3038'), (211, 'AG3039')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28696 of IO tensor {'CrossPassTensor': ''}bfloat16 %input294|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28694 of IO tensor {'CrossPassTensor': ''}bfloat16 %input295|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28695 of IO tensor {'CrossPassTensor': ''}bfloat16 %input296|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16207 of IO tensor {'CrossPassTensor': ''}bfloat16 %input297(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(213, 'AG3049'), (41, 'AG3045'), (120, 'AG3044'), (345, 'AG3048'), (430, 'AG3047')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28705 of IO tensor {'CrossPassTensor': ''}bfloat16 %input298|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28698 of IO tensor {'CrossPassTensor': ''}bfloat16 %input299|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28702 of IO tensor {'CrossPassTensor': ''}bfloat16 %input301(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28699 of IO tensor {'CrossPassTensor': ''}bfloat16 %input303(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28710 of IO tensor {'CrossPassTensor': ''}bfloat16 %input304(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(121, 'AG3052'), (346, 'AG3050'), (214, 'AG3051')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28709 of IO tensor {'CrossPassTensor': ''}bfloat16 %input305|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28707 of IO tensor {'CrossPassTensor': ''}bfloat16 %input306|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28708 of IO tensor {'CrossPassTensor': ''}bfloat16 %input307|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16458 of IO tensor {'CrossPassTensor': ''}bfloat16 %input308(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(216, 'AG3061'), (43, 'AG3057'), (122, 'AG3056'), (349, 'AG3060'), (431, 'AG3059')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28718 of IO tensor {'CrossPassTensor': ''}bfloat16 %input309|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28711 of IO tensor {'CrossPassTensor': ''}bfloat16 %input310|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28715 of IO tensor {'CrossPassTensor': ''}bfloat16 %input312(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28712 of IO tensor {'CrossPassTensor': ''}bfloat16 %input314(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28723 of IO tensor {'CrossPassTensor': ''}bfloat16 %input315(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(123, 'AG3064'), (350, 'AG3062'), (217, 'AG3063')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28722 of IO tensor {'CrossPassTensor': ''}bfloat16 %input316|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28720 of IO tensor {'CrossPassTensor': ''}bfloat16 %input317|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28721 of IO tensor {'CrossPassTensor': ''}bfloat16 %input318|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16709 of IO tensor {'CrossPassTensor': ''}bfloat16 %input319(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(219, 'AG3073'), (45, 'AG3069'), (124, 'AG3068'), (353, 'AG3072'), (432, 'AG3071')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28731 of IO tensor {'CrossPassTensor': ''}bfloat16 %input320|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28724 of IO tensor {'CrossPassTensor': ''}bfloat16 %input321|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28728 of IO tensor {'CrossPassTensor': ''}bfloat16 %input323(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28725 of IO tensor {'CrossPassTensor': ''}bfloat16 %input325(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28736 of IO tensor {'CrossPassTensor': ''}bfloat16 %input326(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(125, 'AG3076'), (354, 'AG3074'), (220, 'AG3075')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28735 of IO tensor {'CrossPassTensor': ''}bfloat16 %input327|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28733 of IO tensor {'CrossPassTensor': ''}bfloat16 %input328|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28734 of IO tensor {'CrossPassTensor': ''}bfloat16 %input329|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16960 of IO tensor {'CrossPassTensor': ''}bfloat16 %input330(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(222, 'AG3085'), (47, 'AG3081'), (126, 'AG3080'), (357, 'AG3084'), (433, 'AG3083')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28744 of IO tensor {'CrossPassTensor': ''}bfloat16 %input331|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28737 of IO tensor {'CrossPassTensor': ''}bfloat16 %input332|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28741 of IO tensor {'CrossPassTensor': ''}bfloat16 %input334(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28738 of IO tensor {'CrossPassTensor': ''}bfloat16 %input336(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28749 of IO tensor {'CrossPassTensor': ''}bfloat16 %input337(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(127, 'AG3088'), (358, 'AG3086'), (223, 'AG3087')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28748 of IO tensor {'CrossPassTensor': ''}bfloat16 %input338|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28746 of IO tensor {'CrossPassTensor': ''}bfloat16 %input339|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28747 of IO tensor {'CrossPassTensor': ''}bfloat16 %input340|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 17211 of IO tensor {'CrossPassTensor': ''}bfloat16 %input341(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(225, 'AG3097'), (49, 'AG3093'), (128, 'AG3092'), (361, 'AG3096'), (434, 'AG3095')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28757 of IO tensor {'CrossPassTensor': ''}bfloat16 %input342|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28750 of IO tensor {'CrossPassTensor': ''}bfloat16 %input343|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28754 of IO tensor {'CrossPassTensor': ''}bfloat16 %input345(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28751 of IO tensor {'CrossPassTensor': ''}bfloat16 %input347(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28762 of IO tensor {'CrossPassTensor': ''}bfloat16 %input348(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(129, 'AG3100'), (362, 'AG3098'), (226, 'AG3099')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28761 of IO tensor {'CrossPassTensor': ''}bfloat16 %input349|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28759 of IO tensor {'CrossPassTensor': ''}bfloat16 %input350|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28760 of IO tensor {'CrossPassTensor': ''}bfloat16 %input351|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 17462 of IO tensor {'CrossPassTensor': ''}bfloat16 %input352(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(228, 'AG3109'), (51, 'AG3105'), (130, 'AG3104'), (365, 'AG3108'), (435, 'AG3107')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28770 of IO tensor {'CrossPassTensor': ''}bfloat16 %input353|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28763 of IO tensor {'CrossPassTensor': ''}bfloat16 %input354|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28767 of IO tensor {'CrossPassTensor': ''}bfloat16 %input356(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28764 of IO tensor {'CrossPassTensor': ''}bfloat16 %input358(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28775 of IO tensor {'CrossPassTensor': ''}bfloat16 %input359(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(131, 'AG3112'), (366, 'AG3110'), (229, 'AG3111')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28774 of IO tensor {'CrossPassTensor': ''}bfloat16 %input360|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28772 of IO tensor {'CrossPassTensor': ''}bfloat16 %input361|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28773 of IO tensor {'CrossPassTensor': ''}bfloat16 %input362|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 17713 of IO tensor {'CrossPassTensor': ''}bfloat16 %input363(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(231, 'AG3121'), (53, 'AG3117'), (132, 'AG3116'), (369, 'AG3120'), (436, 'AG3119')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28783 of IO tensor {'CrossPassTensor': ''}bfloat16 %input364|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28776 of IO tensor {'CrossPassTensor': ''}bfloat16 %input365|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28780 of IO tensor {'CrossPassTensor': ''}bfloat16 %input367(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28777 of IO tensor {'CrossPassTensor': ''}bfloat16 %input369(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28788 of IO tensor {'CrossPassTensor': ''}bfloat16 %input370(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(133, 'AG3124'), (370, 'AG3122'), (232, 'AG3123')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28787 of IO tensor {'CrossPassTensor': ''}bfloat16 %input371|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28785 of IO tensor {'CrossPassTensor': ''}bfloat16 %input372|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28786 of IO tensor {'CrossPassTensor': ''}bfloat16 %input373|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 17964 of IO tensor {'CrossPassTensor': ''}bfloat16 %input374(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(234, 'AG3133'), (55, 'AG3129'), (134, 'AG3128'), (373, 'AG3132'), (437, 'AG3131')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28796 of IO tensor {'CrossPassTensor': ''}bfloat16 %input375|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28789 of IO tensor {'CrossPassTensor': ''}bfloat16 %input376|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28793 of IO tensor {'CrossPassTensor': ''}bfloat16 %input378(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28790 of IO tensor {'CrossPassTensor': ''}bfloat16 %input380(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28801 of IO tensor {'CrossPassTensor': ''}bfloat16 %input381(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(135, 'AG3136'), (374, 'AG3134'), (235, 'AG3135')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28800 of IO tensor {'CrossPassTensor': ''}bfloat16 %input382|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28798 of IO tensor {'CrossPassTensor': ''}bfloat16 %input383|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28799 of IO tensor {'CrossPassTensor': ''}bfloat16 %input384|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 18215 of IO tensor {'CrossPassTensor': ''}bfloat16 %input385(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(237, 'AG3145'), (57, 'AG3141'), (136, 'AG3140'), (377, 'AG3144'), (438, 'AG3143')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28809 of IO tensor {'CrossPassTensor': ''}bfloat16 %input386|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28802 of IO tensor {'CrossPassTensor': ''}bfloat16 %input387|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28806 of IO tensor {'CrossPassTensor': ''}bfloat16 %input389(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28803 of IO tensor {'CrossPassTensor': ''}bfloat16 %input391(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28814 of IO tensor {'CrossPassTensor': ''}bfloat16 %input392(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(137, 'AG3148'), (378, 'AG3146'), (238, 'AG3147')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28813 of IO tensor {'CrossPassTensor': ''}bfloat16 %input393|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28811 of IO tensor {'CrossPassTensor': ''}bfloat16 %input394|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28812 of IO tensor {'CrossPassTensor': ''}bfloat16 %input395|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 18466 of IO tensor {'CrossPassTensor': ''}bfloat16 %input396(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(240, 'AG3157'), (59, 'AG3153'), (138, 'AG3152'), (381, 'AG3156'), (439, 'AG3155')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28822 of IO tensor {'CrossPassTensor': ''}bfloat16 %input397|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28815 of IO tensor {'CrossPassTensor': ''}bfloat16 %input398|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28819 of IO tensor {'CrossPassTensor': ''}bfloat16 %input400(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28816 of IO tensor {'CrossPassTensor': ''}bfloat16 %input402(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28827 of IO tensor {'CrossPassTensor': ''}bfloat16 %input403(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(139, 'AG3160'), (382, 'AG3158'), (241, 'AG3159')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28826 of IO tensor {'CrossPassTensor': ''}bfloat16 %input404|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28824 of IO tensor {'CrossPassTensor': ''}bfloat16 %input405|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28825 of IO tensor {'CrossPassTensor': ''}bfloat16 %input406|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 18717 of IO tensor {'CrossPassTensor': ''}bfloat16 %input407(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(243, 'AG3169'), (61, 'AG3165'), (140, 'AG3164'), (385, 'AG3168'), (440, 'AG3167')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28835 of IO tensor {'CrossPassTensor': ''}bfloat16 %input408|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28828 of IO tensor {'CrossPassTensor': ''}bfloat16 %input409|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28832 of IO tensor {'CrossPassTensor': ''}bfloat16 %input411(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28829 of IO tensor {'CrossPassTensor': ''}bfloat16 %input413(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28840 of IO tensor {'CrossPassTensor': ''}bfloat16 %input414(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(141, 'AG3172'), (386, 'AG3170'), (244, 'AG3171')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28839 of IO tensor {'CrossPassTensor': ''}bfloat16 %input415|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28837 of IO tensor {'CrossPassTensor': ''}bfloat16 %input416|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28838 of IO tensor {'CrossPassTensor': ''}bfloat16 %input417|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 18968 of IO tensor {'CrossPassTensor': ''}bfloat16 %input418(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(246, 'AG3181'), (63, 'AG3177'), (142, 'AG3176'), (389, 'AG3180'), (441, 'AG3179')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28848 of IO tensor {'CrossPassTensor': ''}bfloat16 %input419|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28841 of IO tensor {'CrossPassTensor': ''}bfloat16 %input420|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28845 of IO tensor {'CrossPassTensor': ''}bfloat16 %input422(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28842 of IO tensor {'CrossPassTensor': ''}bfloat16 %input424(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28853 of IO tensor {'CrossPassTensor': ''}bfloat16 %input425(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(143, 'AG3184'), (390, 'AG3182'), (247, 'AG3183')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28852 of IO tensor {'CrossPassTensor': ''}bfloat16 %input426|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28850 of IO tensor {'CrossPassTensor': ''}bfloat16 %input427|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28851 of IO tensor {'CrossPassTensor': ''}bfloat16 %input428|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 19219 of IO tensor {'CrossPassTensor': ''}bfloat16 %input429(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(249, 'AG3193'), (65, 'AG3189'), (144, 'AG3188'), (393, 'AG3192'), (442, 'AG3191')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28861 of IO tensor {'CrossPassTensor': ''}bfloat16 %input430|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28854 of IO tensor {'CrossPassTensor': ''}bfloat16 %input431|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28858 of IO tensor {'CrossPassTensor': ''}bfloat16 %input433(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28855 of IO tensor {'CrossPassTensor': ''}bfloat16 %input435(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28866 of IO tensor {'CrossPassTensor': ''}bfloat16 %input436(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(145, 'AG3196'), (394, 'AG3194'), (250, 'AG3195')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28865 of IO tensor {'CrossPassTensor': ''}bfloat16 %input437|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28863 of IO tensor {'CrossPassTensor': ''}bfloat16 %input438|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28864 of IO tensor {'CrossPassTensor': ''}bfloat16 %input439|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 19470 of IO tensor {'CrossPassTensor': ''}bfloat16 %input440(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(252, 'AG3205'), (67, 'AG3201'), (146, 'AG3200'), (397, 'AG3204'), (443, 'AG3203')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28874 of IO tensor {'CrossPassTensor': ''}bfloat16 %input441|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28867 of IO tensor {'CrossPassTensor': ''}bfloat16 %input442|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28871 of IO tensor {'CrossPassTensor': ''}bfloat16 %input444(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28868 of IO tensor {'CrossPassTensor': ''}bfloat16 %input446(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28879 of IO tensor {'CrossPassTensor': ''}bfloat16 %input447(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(147, 'AG3208'), (398, 'AG3206'), (253, 'AG3207')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28878 of IO tensor {'CrossPassTensor': ''}bfloat16 %input448|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28876 of IO tensor {'CrossPassTensor': ''}bfloat16 %input449|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28877 of IO tensor {'CrossPassTensor': ''}bfloat16 %input450|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 19721 of IO tensor {'CrossPassTensor': ''}bfloat16 %input451(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(255, 'AG3217'), (69, 'AG3213'), (148, 'AG3212'), (401, 'AG3216'), (444, 'AG3215')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28887 of IO tensor {'CrossPassTensor': ''}bfloat16 %input452|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28880 of IO tensor {'CrossPassTensor': ''}bfloat16 %input453|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28884 of IO tensor {'CrossPassTensor': ''}bfloat16 %input455(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28881 of IO tensor {'CrossPassTensor': ''}bfloat16 %input457(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28892 of IO tensor {'CrossPassTensor': ''}bfloat16 %input458(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(149, 'AG3220'), (402, 'AG3218'), (256, 'AG3219')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28891 of IO tensor {'CrossPassTensor': ''}bfloat16 %input459|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28889 of IO tensor {'CrossPassTensor': ''}bfloat16 %input460|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28890 of IO tensor {'CrossPassTensor': ''}bfloat16 %input461|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 19972 of IO tensor {'CrossPassTensor': ''}bfloat16 %input462(16, 128, 4, 4, 2, 128) is not sorted, index list (w/ AG ids): [(258, 'AG3229'), (71, 'AG3225'), (150, 'AG3224'), (405, 'AG3228'), (445, 'AG3227')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28900 of IO tensor {'CrossPassTensor': ''}bfloat16 %input463|NHWC|(4, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28893 of IO tensor {'CrossPassTensor': ''}bfloat16 %input464|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28897 of IO tensor {'CrossPassTensor': ''}bfloat16 %input466(4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28894 of IO tensor {'CrossPassTensor': ''}bfloat16 %input468(4, 4, 128, 32, 2, 64) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28905 of IO tensor {'CrossPassTensor': ''}bfloat16 %input469(32, 2, 128, 24, 128) is not sorted, index list (w/ AG ids): [(151, 'AG3232'), (406, 'AG3230'), (259, 'AG3231')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28904 of IO tensor {'CrossPassTensor': ''}bfloat16 %input470|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28902 of IO tensor {'CrossPassTensor': ''}bfloat16 %input471|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28903 of IO tensor {'CrossPassTensor': ''}bfloat16 %input472|NHWC|(2, 24, 128, 32, 128) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 28906 of IO tensor {'CrossPassTensor': ''}bfloat16 %input474|NC|(128, 32) is not sorted, index list (w/ AG ids): [(410, 'AG2808'), (265, 'AG2809')] +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 1.969 seconds +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.226 seconds +2025-08-07T13:54:54Z INFO 48501 [sg0000/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-08-07T13:54:55Z INFO 48501 [sg0000/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-08-07T13:54:55Z INFO 48501 [sg0000/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.494 seconds +2025-08-07T13:54:55Z INFO 48501 [sg0000/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-08-07T13:54:55Z INFO 48501 [sg0000/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-08-07T13:54:55Z INFO 48501 [sg0000/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.058 seconds +2025-08-07T13:54:55Z INFO 48501 [sg0000/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-08-07T13:54:55Z INFO 48501 [sg0000/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-08-07T13:54:55Z INFO 48501 [sg0000/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.013 seconds +2025-08-07T13:54:55Z INFO 48501 [sg0000/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-08-07T13:54:57Z INFO 48501 [sg0000/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-08-07T13:54:57Z INFO 48501 [sg0000/Tensorizer/MacroGeneration]: MacroGeneration finished after 2.181 seconds +2025-08-07T13:54:57Z INFO 48501 [sg0000/Tensorizer/PGTiling]: PGTiling finished after 4.972 seconds +2025-08-07T13:54:57Z INFO 48501 [sg0000/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-08-07T13:54:58Z INFO 48501 [sg0000/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-08-07T13:54:58Z INFO 48501 [sg0000/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.991 seconds +2025-08-07T13:54:58Z INFO 48501 [sg0000/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-08-07T13:54:58Z INFO 48501 [sg0000/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-08-07T13:54:58Z INFO 48501 [sg0000/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.095 seconds +2025-08-07T13:54:58Z INFO 48501 [sg0000/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-08-07T13:54:59Z INFO 48501 [sg0000/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-08-07T13:54:59Z INFO 48501 [sg0000/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 1.095 seconds +2025-08-07T13:54:59Z INFO 48501 [sg0000/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 43.085 seconds +2025-08-07T13:54:59Z INFO 48501 [sg0000/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-08-07T13:54:59Z INFO 48501 [sg0000/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:54:59Z INFO 48501 [sg0000/Tensorizer/TilingBottleneck]: 19008: transpose_128x128 +2025-08-07T13:54:59Z INFO 48501 [sg0000/Tensorizer/TilingBottleneck]: 19008: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48501 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48501 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48501 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48501 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48501 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48501 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48501 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48501 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48501 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48501 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48501 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48501 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48501 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48501 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48501 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48501 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48501 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:54:59Z INFO 48501 [sg0000/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:00Z INFO 48501 [sg0000/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-08-07T13:55:00Z INFO 48501 [sg0000/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.584 seconds +2025-08-07T13:55:00Z INFO 48501 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:55:00Z INFO 48501 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:55:00Z INFO 48501 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.457 seconds +2025-08-07T13:55:00Z INFO 48501 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-08-07T13:55:02Z INFO 48501 [sg0000/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-08-07T13:55:02Z INFO 48501 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 1.784 seconds +2025-08-07T13:55:02Z INFO 48501 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:55:02Z INFO 48501 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:55:02Z INFO 48501 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.270 seconds +2025-08-07T13:55:02Z INFO 48501 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:55:02Z INFO 48501 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:55:02Z INFO 48501 [sg0000/Tensorizer/LICM]: LICM finished after 0.094 seconds +2025-08-07T13:55:02Z INFO 48501 [sg0000/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-08-07T13:55:02Z INFO 48501 [sg0000/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-08-07T13:55:02Z INFO 48501 [sg0000/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.049 seconds +2025-08-07T13:55:02Z INFO 48501 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:55:02Z INFO 48501 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:55:02Z INFO 48501 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.138 seconds +2025-08-07T13:55:02Z INFO 48501 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:55:03Z INFO 48501 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-08-07T13:55:03Z INFO 48501 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.130 seconds +2025-08-07T13:55:03Z INFO 48501 [sg0000/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 1.989 seconds +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 19008: transpose_128x128 +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 19008: matmul_128x128x1 +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x1 +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.072 seconds +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.321 seconds +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12250 | hlo_id: 12250 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12235 | hlo_id: 12235 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12278 | hlo_id: 12278 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12263 | hlo_id: 12263 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12306 | hlo_id: 12306 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12291 | hlo_id: 12291 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12334 | hlo_id: 12334 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12319 | hlo_id: 12319 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12362 | hlo_id: 12362 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12347 | hlo_id: 12347 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12390 | hlo_id: 12390 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12375 | hlo_id: 12375 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12418 | hlo_id: 12418 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12403 | hlo_id: 12403 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12446 | hlo_id: 12446 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12431 | hlo_id: 12431 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12474 | hlo_id: 12474 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12459 | hlo_id: 12459 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12502 | hlo_id: 12502 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12487 | hlo_id: 12487 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12530 | hlo_id: 12530 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12515 | hlo_id: 12515 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12558 | hlo_id: 12558 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12543 | hlo_id: 12543 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12586 | hlo_id: 12586 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12571 | hlo_id: 12571 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12614 | hlo_id: 12614 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12599 | hlo_id: 12599 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12642 | hlo_id: 12642 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12627 | hlo_id: 12627 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12670 | hlo_id: 12670 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12655 | hlo_id: 12655 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12698 | hlo_id: 12698 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12683 | hlo_id: 12683 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12726 | hlo_id: 12726 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12711 | hlo_id: 12711 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12754 | hlo_id: 12754 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12739 | hlo_id: 12739 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12782 | hlo_id: 12782 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12767 | hlo_id: 12767 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12810 | hlo_id: 12810 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12795 | hlo_id: 12795 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12838 | hlo_id: 12838 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12823 | hlo_id: 12823 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12866 | hlo_id: 12866 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12851 | hlo_id: 12851 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12894 | hlo_id: 12894 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12879 | hlo_id: 12879 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12922 | hlo_id: 12922 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12907 | hlo_id: 12907 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12950 | hlo_id: 12950 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12935 | hlo_id: 12935 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12978 | hlo_id: 12978 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12963 | hlo_id: 12963 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13006 | hlo_id: 13006 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.12991 | hlo_id: 12991 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13034 | hlo_id: 13034 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13019 | hlo_id: 13019 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13062 | hlo_id: 13062 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13047 | hlo_id: 13047 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13090 | hlo_id: 13090 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13075 | hlo_id: 13075 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13118 | hlo_id: 13118 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13103 | hlo_id: 13103 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13146 | hlo_id: 13146 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13131 | hlo_id: 13131 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13174 | hlo_id: 13174 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13159 | hlo_id: 13159 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13202 | hlo_id: 13202 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13187 | hlo_id: 13187 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13230 | hlo_id: 13230 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: unsupported partition shape for offset dge in tensor_op_name: _scatter.13215 | hlo_id: 13215 | +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.435 seconds +2025-08-07T13:55:05Z INFO 48501 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:55:06Z INFO 48501 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:55:06Z INFO 48501 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.338 seconds +2025-08-07T13:55:06Z INFO 48501 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-08-07T13:55:06Z INFO 48501 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-08-07T13:55:06Z INFO 48501 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.066 seconds +2025-08-07T13:55:06Z INFO 48501 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:55:06Z INFO 48501 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-08-07T13:55:06Z INFO 48501 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.180 seconds +2025-08-07T13:55:06Z INFO 48501 [sg0000/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-08-07T13:55:06Z INFO 48501 [sg0000/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-08-07T13:55:06Z INFO 48501 [sg0000/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.061 seconds +2025-08-07T13:55:06Z INFO 48501 [sg0000/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-08-07T13:55:06Z INFO 48501 [sg0000/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-08-07T13:55:06Z INFO 48501 [sg0000/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.021 seconds +2025-08-07T13:55:06Z INFO 48501 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-08-07T13:55:06Z INFO 48501 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-08-07T13:55:06Z INFO 48501 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.080 seconds +2025-08-07T13:55:06Z INFO 48501 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-08-07T13:55:06Z INFO 48501 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-08-07T13:55:06Z INFO 48501 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.188 seconds +2025-08-07T13:55:06Z INFO 48501 [sg0000/Tensorizer/InferInitValue]: Running InferInitValue +2025-08-07T13:55:07Z INFO 48501 [sg0000/Tensorizer/InferInitValue]: Finished (changed=True) +2025-08-07T13:55:07Z INFO 48501 [sg0000/Tensorizer/InferInitValue]: InferInitValue finished after 1.089 seconds +2025-08-07T13:55:07Z INFO 48501 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-08-07T13:55:08Z INFO 48501 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-08-07T13:55:08Z INFO 48501 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.335 seconds +2025-08-07T13:55:08Z INFO 48501 [sg0000/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-08-07T13:55:08Z INFO 48501 [sg0000/Tensorizer/SimplifyTensor]: Finished (changed=False) +2025-08-07T13:55:08Z INFO 48501 [sg0000/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.208 seconds +2025-08-07T13:55:08Z INFO 48501 [sg0000/Tensorizer/LICM]: Running LICM +2025-08-07T13:55:08Z INFO 48501 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-08-07T13:55:08Z INFO 48501 [sg0000/Tensorizer/LICM]: LICM finished after 0.111 seconds +2025-08-07T13:55:08Z INFO 48501 [sg0000/Tensorizer/SundaISel]: Running SundaISel +2025-08-07T13:55:10Z INFO 48501 [sg0000/Tensorizer/SundaISel]: Finished (changed=True) +2025-08-07T13:55:10Z INFO 48501 [sg0000/Tensorizer/SundaISel]: SundaISel finished after 1.649 seconds +2025-08-07T13:55:10Z INFO 48501 [sg0000/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-08-07T13:55:10Z INFO 48501 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-08-07T13:55:10Z INFO 48501 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-08-07T13:55:10Z INFO 48501 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.003 seconds +2025-08-07T13:55:10Z INFO 48501 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-08-07T13:55:10Z INFO 48501 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=True) +2025-08-07T13:55:10Z INFO 48501 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.026 seconds +2025-08-07T13:55:10Z INFO 48501 [sg0000/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.036 seconds +2025-08-07T13:55:10Z INFO 48501 [sg0000/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-08-07T13:55:10Z INFO 48501 [sg0000/Tensorizer/LowerComplexBroadcast]: Finished (changed=True) +2025-08-07T13:55:10Z INFO 48501 [sg0000/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.157 seconds +2025-08-07T13:55:10Z INFO 48501 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:55:10Z INFO 48501 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-08-07T13:55:10Z INFO 48501 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.052 seconds +2025-08-07T13:55:10Z INFO 48501 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:55:10Z INFO 48501 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:55:10Z INFO 48501 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.057 seconds +2025-08-07T13:55:10Z INFO 48501 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-08-07T13:55:11Z INFO 48501 [sg0000/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-08-07T13:55:11Z INFO 48501 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.511 seconds +2025-08-07T13:55:11Z INFO 48501 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-08-07T13:55:11Z INFO 48501 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-08-07T13:55:11Z INFO 48501 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.050 seconds +2025-08-07T13:55:11Z INFO 48501 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:55:11Z INFO 48501 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-08-07T13:55:11Z INFO 48501 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.290 seconds +2025-08-07T13:55:11Z INFO 48501 [sg0000/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:55:11Z INFO 48501 [sg0000/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-08-07T13:55:11Z INFO 48501 [sg0000/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.404 seconds +2025-08-07T13:55:11Z INFO 48501 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:55:13Z INFO 48501 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-08-07T13:55:13Z INFO 48501 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 1.732 seconds +2025-08-07T13:55:13Z INFO 48501 [sg0000/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:55:13Z INFO 48501 [sg0000/Tensorizer/NeuronValueNumbering]: Finished (changed=True) +2025-08-07T13:55:13Z INFO 48501 [sg0000/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.111 seconds +2025-08-07T13:55:13Z INFO 48501 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:55:13Z INFO 48501 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:55:13Z INFO 48501 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.215 seconds +2025-08-07T13:55:13Z INFO 48501 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-08-07T13:55:13Z INFO 48501 [sg0000/Tensorizer/VectorizeDMA]: Finished (changed=False) +2025-08-07T13:55:13Z INFO 48501 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.040 seconds +2025-08-07T13:55:13Z INFO 48501 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:55:14Z INFO 48501 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:55:14Z INFO 48501 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.049 seconds +2025-08-07T13:55:14Z INFO 48501 [sg0000/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-08-07T13:55:14Z INFO 48501 [sg0000/Tensorizer/LegalizePartitionReduce]: Finished (changed=True) +2025-08-07T13:55:14Z INFO 48501 [sg0000/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.085 seconds +2025-08-07T13:55:14Z INFO 48501 [sg0000/Tensorizer/DeConcat]: Running DeConcat +2025-08-07T13:55:14Z INFO 48501 [sg0000/Tensorizer/DeConcat]: Finished (changed=False) +2025-08-07T13:55:14Z INFO 48501 [sg0000/Tensorizer/DeConcat]: DeConcat finished after 0.022 seconds +2025-08-07T13:55:14Z INFO 48501 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-08-07T13:55:14Z INFO 48501 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-08-07T13:55:14Z INFO 48501 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.045 seconds +2025-08-07T13:55:14Z INFO 48501 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-08-07T13:55:14Z INFO 48501 [sg0000/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-08-07T13:55:14Z INFO 48501 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.346 seconds +2025-08-07T13:55:14Z INFO 48501 [sg0000/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-08-07T13:55:15Z INFO 48501 [sg0000/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-08-07T13:55:15Z INFO 48501 [sg0000/Tensorizer/TritiumFusion]: TritiumFusion finished after 1.208 seconds +2025-08-07T13:55:15Z INFO 48501 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:55:16Z INFO 48501 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-08-07T13:55:16Z INFO 48501 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.356 seconds +2025-08-07T13:55:16Z INFO 48501 [sg0000/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-08-07T13:55:16Z INFO 48501 [sg0000/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-08-07T13:55:16Z INFO 48501 [sg0000/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.024 seconds +2025-08-07T13:55:16Z INFO 48501 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-08-07T13:55:16Z INFO 48501 [sg0000/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-08-07T13:55:16Z INFO 48501 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.403 seconds +2025-08-07T13:55:16Z INFO 48501 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:55:16Z INFO 48501 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:55:16Z INFO 48501 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.208 seconds +2025-08-07T13:55:16Z INFO 48501 [sg0000/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:55:17Z INFO 48501 [sg0000/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-08-07T13:55:17Z INFO 48501 [sg0000/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.392 seconds +2025-08-07T13:55:17Z INFO 48501 [sg0000/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:55:17Z INFO 48501 [sg0000/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:55:17Z INFO 48501 [sg0000/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.053 seconds +2025-08-07T13:55:17Z INFO 48501 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:55:17Z INFO 48501 [sg0000/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-08-07T13:55:17Z INFO 48501 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.498 seconds +2025-08-07T13:55:17Z INFO 48501 [sg0000/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-08-07T13:55:17Z INFO 48501 [sg0000/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-08-07T13:55:17Z INFO 48501 [sg0000/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.043 seconds +2025-08-07T13:55:17Z INFO 48501 [sg0000/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:55:18Z INFO 48501 [sg0000/Tensorizer/SpillPSum]: Finished (changed=True) +2025-08-07T13:55:18Z INFO 48501 [sg0000/Tensorizer/SpillPSum]: SpillPSum finished after 0.407 seconds +2025-08-07T13:55:18Z INFO 48501 [sg0000/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:55:19Z INFO 48501 [sg0000/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-08-07T13:55:19Z INFO 48501 [sg0000/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 1.223 seconds +2025-08-07T13:55:19Z INFO 48501 [sg0000/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-08-07T13:55:19Z INFO 48501 [sg0000/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-08-07T13:55:19Z INFO 48501 [sg0000/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.058 seconds +2025-08-07T13:55:19Z INFO 48501 [sg0000/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:55:19Z INFO 48501 [sg0000/Tensorizer/LegalizeType]: Finished (changed=True) +2025-08-07T13:55:19Z INFO 48501 [sg0000/Tensorizer/LegalizeType]: LegalizeType finished after 0.195 seconds +2025-08-07T13:55:19Z INFO 48501 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:55:19Z INFO 48501 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:55:19Z INFO 48501 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.296 seconds +2025-08-07T13:55:19Z INFO 48501 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:55:21Z INFO 48501 [sg0000/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-08-07T13:55:21Z INFO 48501 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 1.424 seconds +2025-08-07T13:55:21Z INFO 48501 [sg0000/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:55:21Z INFO 48501 [sg0000/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-08-07T13:55:21Z INFO 48501 [sg0000/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.060 seconds +2025-08-07T13:55:21Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:55:22Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-08-07T13:55:22Z INFO 48501 [sg0000/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 1.399 seconds +2025-08-07T13:55:22Z INFO 48501 [sg0000/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-08-07T13:55:23Z INFO 48501 [sg0000/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-08-07T13:55:23Z INFO 48501 [sg0000/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.153 seconds +2025-08-07T13:55:23Z INFO 48501 [sg0000/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-08-07T13:55:23Z INFO 48501 [sg0000/Tensorizer/TensorInitialization]: Finished (changed=True) +2025-08-07T13:55:23Z INFO 48501 [sg0000/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.203 seconds +2025-08-07T13:55:23Z INFO 48501 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:55:23Z INFO 48501 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:55:23Z INFO 48501 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.182 seconds +2025-08-07T13:55:23Z INFO 48501 [sg0000/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:55:23Z INFO 48501 [sg0000/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:55:23Z INFO 48501 [sg0000/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.094 seconds +2025-08-07T13:55:23Z INFO 48501 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:55:24Z INFO 48501 [sg0000/Tensorizer/SimplifyNeuronTensor]: Finished (changed=True) +2025-08-07T13:55:24Z INFO 48501 [sg0000/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 1.362 seconds +2025-08-07T13:55:24Z INFO 48501 [sg0000/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:55:24Z INFO 48501 [sg0000/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-08-07T13:55:24Z INFO 48501 [sg0000/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.037 seconds +2025-08-07T13:55:24Z INFO 48501 [sg0000/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:55:25Z INFO 48501 [sg0000/Tensorizer/DataStreaming]: Finished (changed=False) +2025-08-07T13:55:25Z INFO 48501 [sg0000/Tensorizer/DataStreaming]: DataStreaming finished after 0.163 seconds +2025-08-07T13:55:25Z INFO 48501 [sg0000/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:55:29Z INFO 48501 [sg0000/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:55:29Z INFO 48501 [sg0000/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 4.835 seconds +2025-08-07T13:55:29Z INFO 48501 [sg0000/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:55:30Z INFO 48501 [sg0000/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-08-07T13:55:30Z INFO 48501 [sg0000/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.326 seconds +2025-08-07T13:55:30Z INFO 48501 [sg0000/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:55:30Z INFO 48501 [sg0000/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-08-07T13:55:30Z INFO 48501 [sg0000/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.196 seconds +2025-08-07T13:55:30Z INFO 48501 [sg0000/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:55:30Z INFO 48501 [sg0000/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [sg0000/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.070 seconds +2025-08-07T13:55:30Z INFO 48501 [sg0000/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:55:30Z INFO 48501 [sg0000/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:55:30Z INFO 48501 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 2.705ms (594.000MiB, est bw: 230.258GB/s, 7.668% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (594, 128, 4096) %'36446.52149'[i4422_0,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (75968, 4096) %'input473'[128i4422_0+i0.128,i1.4096] # id=52148, src_id=None, , instances=594 # dl = tensor_op_name: input473_pftranspose_36446 | hlo_id: 20004 | if -128i4422_0-i0.128+75967 >= 0 [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-08-07T13:55:30Z INFO 48501 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.656% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input84_local_38595'[i148_0,i147_0_0_38599,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 2, 128, 3072) %'input84'[i148_0,i147_0_0_38599,i0.128,i1.3072] # id=42944, src_id=None, , instances=64 # dl = tensor_op_name: _dot.395 | hlo_id: 15976 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:55:30Z INFO 48501 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.656% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input95_local_38671'[i270_0,i269_0_0_38675,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 2, 128, 3072) %'input95'[i270_0,i269_0_0_38675,i0.128,i1.3072] # id=43122, src_id=None, , instances=64 # dl = tensor_op_name: _dot.727 | hlo_id: 16091 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:55:30Z INFO 48501 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.656% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input106_local_38747'[i392_0,i391_0_0_38751,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 2, 128, 3072) %'input106'[i392_0,i391_0_0_38751,i0.128,i1.3072] # id=43300, src_id=None, , instances=64 # dl = tensor_op_name: _dot.1059 | hlo_id: 16206 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:55:30Z INFO 48501 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.656% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input117_local_38823'[i514_0,i513_0_0_38827,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 2, 128, 3072) %'input117'[i514_0,i513_0_0_38827,i0.128,i1.3072] # id=43478, src_id=None, , instances=64 # dl = tensor_op_name: _dot.1391 | hlo_id: 16321 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:55:30Z INFO 48501 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.656% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input128_local_38899'[i636_0,i635_0_0_38903,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 2, 128, 3072) %'input128'[i636_0,i635_0_0_38903,i0.128,i1.3072] # id=43656, src_id=None, , instances=64 # dl = tensor_op_name: _dot.1723 | hlo_id: 16436 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:55:30Z INFO 48501 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.656% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input139_local_38975'[i758_0,i757_0_0_38979,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 2, 128, 3072) %'input139'[i758_0,i757_0_0_38979,i0.128,i1.3072] # id=43834, src_id=None, , instances=64 # dl = tensor_op_name: _dot.2055 | hlo_id: 16551 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:55:30Z INFO 48501 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.656% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input150_local_39051'[i880_0,i879_0_0_39055,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 2, 128, 3072) %'input150'[i880_0,i879_0_0_39055,i0.128,i1.3072] # id=44012, src_id=None, , instances=64 # dl = tensor_op_name: _dot.2387 | hlo_id: 16666 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:55:30Z INFO 48501 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.656% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input161_local_39127'[i1002_0,i1001_0_0_39131,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 2, 128, 3072) %'input161'[i1002_0,i1001_0_0_39131,i0.128,i1.3072] # id=44190, src_id=None, , instances=64 # dl = tensor_op_name: _dot.2719 | hlo_id: 16781 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:55:30Z INFO 48501 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 231.410us (48.000MiB, est bw: 217.500GB/s, 0.656% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (32, 2, 128, 3072) %'input172_local_39203'[i1124_0,i1123_0_0_39207,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (32, 2, 128, 3072) %'input172'[i1124_0,i1123_0_0_39207,i0.128,i1.3072] # id=44368, src_id=None, , instances=64 # dl = tensor_op_name: _dot.3051 | hlo_id: 16896 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-08-07T13:55:30Z INFO 48501 [sg0000/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [sg0000/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.091 seconds +2025-08-07T13:55:30Z INFO 48501 [sg0000/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.001 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.001 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.004 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.000 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.001 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.006 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.001 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.001 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (1, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (1, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.000 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.001 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.000 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.000 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-08-07T13:55:30Z INFO 48501 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.003 seconds +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (1, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (1, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48501 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-08-07T13:55:31Z INFO 48501 [sg0000/Tensorizer/OptimizeNKIKernels]: Finished (changed=True) +2025-08-07T13:55:31Z INFO 48501 [sg0000/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.491 seconds +2025-08-07T13:55:31Z INFO 48501 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-08-07T13:55:31Z INFO 48501 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-08-07T13:55:31Z INFO 48501 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.475 seconds +2025-08-07T13:55:31Z INFO 48501 [sg0000/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-08-07T13:55:31Z WARNING 48501 [sg0000/Tensorizer/StaticProfiler]: matmul-based transposes inserted by penguin takes up 91.48 percent of all matmul computation +2025-08-07T13:55:31Z INFO 48501 [sg0000/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-08-07T13:55:31Z INFO 48501 [sg0000/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.141 seconds +2025-08-07T13:55:31Z INFO 48501 [sg0000/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-08-07T13:55:32Z INFO 48501 [sg0000/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-08-07T13:55:32Z INFO 48501 [sg0000/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.360 seconds +2025-08-07T13:55:32Z INFO 48501 [sg0000/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-08-07T13:55:32Z INFO 48501 [sg0000/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-08-07T13:55:32Z INFO 48501 [sg0000/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.098 seconds +2025-08-07T13:55:32Z INFO 48501 [sg0000/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-08-07T13:55:32Z INFO 48501 [sg0000/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-08-07T13:55:32Z INFO 48501 [sg0000/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.253 seconds +2025-08-07T13:55:32Z INFO 48501 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-08-07T13:55:32Z INFO 48501 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-08-07T13:55:32Z INFO 48501 [sg0000/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.001 seconds +2025-08-07T13:55:32Z INFO 48501 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-08-07T13:55:34Z INFO 48501 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-08-07T13:55:34Z INFO 48501 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 2.279 seconds +2025-08-07T13:55:36Z INFO 48501 [Tensorizer]: BirCodeGen estimate #instances=327134 in sg0000 +2025-08-07T13:55:36Z INFO 48501 [Tensorizer]: IR signature: b2cbbeb4727abf8d94548edb911c6abea59fa6044113313ec7ac7e59d8987dfb for nc00/sg0000/TensorizerBIR +2025-08-07T13:55:36Z INFO 48501 [Tensorizer]: Weights total number of bytes: 4952584 +2025-08-07T13:55:36Z INFO 48501 [Tensorizer]: Successfully built model. +2025-08-07T13:55:36Z USER 48501 [root/Tensorizer/Tensorizer]: Tensorizer finished after 104.299 seconds +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: End tensorization +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input0 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input1 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input2 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input3 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input4 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input5 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input6 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input7 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input8 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input9 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input10 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input11 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input12 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input13 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input14 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input15 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input16 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input17 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input18 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input19 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input20 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input21 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input22 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input23 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input24 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input25 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input26 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input27 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input28 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input29 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input30 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input31 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input32 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input33 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input34 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input35 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input36 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input37 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input38 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input39 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input40 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input41 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input42 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input43 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input44 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input45 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input46 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input47 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input48 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input49 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input50 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input51 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input52 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input53 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input54 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input55 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input56 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input57 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input58 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input59 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input60 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input61 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input62 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input63 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input64 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input65 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input66 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input67 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input68 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input69 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input70 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input71 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input72 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input73 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input74 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input75 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input76 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input77 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input78 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input79 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input80 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input81 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input82 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input83 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input84 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input85 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input86 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input87 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input88 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input89 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input90 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input91 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input92 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input93 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input94 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input95 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input96 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input97 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input98 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input99 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input100 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input101 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input102 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input103 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input104 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input105 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input106 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input107 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input108 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input109 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input110 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input111 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input112 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input113 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input114 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input115 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input116 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input117 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input118 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input119 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input120 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input121 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input122 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input123 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input124 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input125 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input126 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input127 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input128 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input129 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input130 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input131 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input132 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input133 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input134 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input135 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input136 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input137 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input138 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input139 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input140 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input141 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input142 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input143 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input144 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input145 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input146 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input147 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input148 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input149 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input150 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input151 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input152 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input153 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input154 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input155 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input156 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input157 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input158 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input159 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input160 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input161 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input162 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input163 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input164 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input165 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input166 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input167 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input168 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input169 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input170 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input171 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input172 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input173 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input174 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input175 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input176 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input177 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input178 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input179 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input180 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input181 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input182 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input183 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input184 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input185 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input186 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input187 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input188 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input189 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input190 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input191 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input192 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input193 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input194 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input195 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input196 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input197 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input198 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input199 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input200 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input201 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input202 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input203 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input204 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input205 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input206 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input207 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input208 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input209 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input210 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input211 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input212 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input213 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input214 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input215 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input216 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input217 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input218 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input219 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input220 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input221 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input222 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input223 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input224 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input225 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input226 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input227 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input228 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input229 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input230 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input231 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input232 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input233 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input234 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input235 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input236 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input237 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input238 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input239 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input240 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input241 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input242 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input243 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input244 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input245 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input246 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input247 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input248 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input249 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input250 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input251 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input252 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input253 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input254 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input255 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input256 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input257 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input258 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input259 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input260 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input261 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input262 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input263 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input264 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input265 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input266 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input267 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input268 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input269 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input270 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input271 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input272 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input273 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input274 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input275 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input276 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input277 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input278 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input279 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input280 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input281 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input282 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input283 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input284 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input285 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input286 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input287 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input288 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input289 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input290 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input291 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input292 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input293 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input294 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input295 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input296 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input297 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input298 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input299 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input300 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input301 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input302 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input303 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input304 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input305 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input306 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input307 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input308 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input309 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input310 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input311 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input312 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input313 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input314 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input315 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input316 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input317 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input318 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input319 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input320 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input321 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input322 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input323 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input324 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input325 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input326 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input327 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input328 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input329 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input330 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input331 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input332 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input333 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input334 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input335 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input336 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input337 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input338 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input339 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input340 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input341 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input342 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input343 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input344 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input345 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input346 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input347 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input348 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input349 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input350 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input351 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input352 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input353 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input354 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input355 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input356 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input357 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input358 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input359 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input360 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input361 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input362 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input363 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input364 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input365 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input366 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input367 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input368 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input369 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input370 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input371 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input372 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input373 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input374 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input375 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input376 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input377 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input378 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input379 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input380 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input381 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input382 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input383 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input384 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input385 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input386 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input387 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input388 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input389 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input390 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input391 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input392 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input393 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input394 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input395 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input396 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input397 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input398 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input399 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input400 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input401 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input402 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input403 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input404 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input405 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input406 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input407 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input408 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input409 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input410 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input411 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input412 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input413 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input414 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input415 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input416 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input417 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input418 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input419 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input420 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input421 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input422 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input423 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input424 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input425 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input426 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input427 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input428 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input429 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input430 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input431 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input432 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input433 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input434 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input435 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input436 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input437 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input438 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input439 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input440 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input441 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input442 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input443 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input444 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input445 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input446 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input447 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input448 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input449 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input450 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input451 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input452 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input453 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input454 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input455 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input456 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input457 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input458 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input459 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input460 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input461 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input462 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input463 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input464 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input465 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input466 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input467 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input468 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input469 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input470 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input471 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input472 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input473 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Network input: input474 +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: wrote bir.json +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: wrote tensor_map.json +2025-08-07T13:55:36Z INFO 48501 [job.Frontend.0]: Job #0 finished +2025-08-07T13:55:36Z INFO 48501 [pipeline.Pipeline.0]: Finished job job.Frontend.0 +2025-08-07T13:55:36Z INFO 48501 [pipeline.Pipeline.0]: Starting job job.StaticIOTranspose.0 +2025-08-07T13:55:36Z INFO 48501 [pipeline.Pipeline.0]: Finished job job.StaticIOTranspose.0 +2025-08-07T13:55:36Z INFO 48501 [pipeline.Pipeline.0]: Starting job job.WalrusDriver.0 +2025-08-07T13:55:36Z INFO 48501 [job.WalrusDriver.0]: BackendDriver has 1 states with 1 core LNC +2025-08-07T13:55:36Z INFO 48501 [job.WalrusDriver.0]: BackendDriver: no partitions found. Switching to flat flow. +2025-08-07T13:55:36Z INFO 48501 [job.WalrusDriver.0]: Job WalrusDriver len(in_states) 1 +2025-08-07T13:55:36Z INFO 48501 [job.WalrusDriver.0]: Processing input #0 +2025-08-07T13:55:36Z INFO 48501 [job.WalrusDriver.0]: BackendDriver in_state.num_states 1 with 1 core LNC +2025-08-07T13:55:36Z INFO 48501 [job.WalrusDriver.0]: Executing /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/walrus_driver --optlevel 2 --allocator coloring --verbose 35 --logfile-verbose 20 --logfile /home/ubuntu/qwen3/token_generation_model/_tp0_bk3/log-neuron-cc.txt --execute-repetition 1 -i bir.json --min_split_size 10240 --skip_split_vns '' --no_split_dram --split_huge_dram_tensor 1.0 --preprocessing_only --max_tensorizer_distance 64 --pack_same_shape_only --instruction_fetch_latency 511 --max-partitions 1 --policy 3 --auxflag 0 --interleave none --schedule-delayed-latency 1 --postsched-mm-accum-reorder=false --max-load-color-rotation --max-load-lower-bound 0.14 --mm-reorder-opt --force-prefetch-follow-incoming-order -1 --allreduce-buffer-size 500 --dram-page-size 512 --dram-rotation-size -1 --allreduce-rotation-dis 8 --repeat-load-thres 4 --enable-mm-transpose-remat-optimization=true --save-len-thres 512 --save-dma-cnt-thres 32 --relaxed-order=true --enable-anti-dependence-reduction=false --num-semaphores-per-queue 16 --numcores 1 --act-root-json /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/pwp/pwp_bin_trainium/act_info.json --dve-root-json /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen2/dve_info.json --unified-backend-and-legacy-codegen --tensor-map tensor_map.json --enable-verifier=true --enable-birsim=false --enable-birsim-sync-only=false --enable-data-race-checker=false --enable-new-backend=true --inject-error=NONE --dge-levels vector_dynamic_offsets,io,scalar_dynamic_offset --dynamic-dma-scratch-size-per-partition=16384 --neff-output-filename /home/ubuntu/qwen3/token_generation_model/_tp0_bk3/model.MODULE_d3ed4857bd8baeff8023+b05cff0a.neff +2025-08-07T13:55:36Z INFO 48501 [job.WalrusDriver.0]: Working directory is /home/ubuntu/qwen3/token_generation_model/_tp0_bk3/neuronxcc-m7dyulmn/sg00 +2025-08-07T13:55:36Z INFO 48501 [job.WalrusDriver.0]: propagate_exit=True +2025-08-07T13:55:36Z INFO 48501 [job.WalrusDriver.0]: use_logger=False +2025-08-07T13:55:36Z INFO 48501 [job.WalrusDriver.0]: expose_stderr=True +2025-08-07T13:55:36Z INFO 49724 [Logging]: Logging to ../../log-neuron-cc.txt at level 'INFO' +2025-08-07T13:55:36Z INFO 49724 [BackendDriver]: max_allowed_parallelism=128 +2025-08-07T13:55:37Z INFO 49724 [BackendDriver]: Backend driver mtBackend: false numModules: 1 Cwd: "/home/ubuntu/qwen3/token_generation_model/_tp0_bk3/neuronxcc-m7dyulmn/sg00" +2025-08-07T13:55:37Z INFO 49724 [BackendDriver]: DynamicDMA is enabled +2025-08-07T13:55:37Z INFO 49724 [BackendDriver]: DynamicDMA levels being enabled: io, scalar_dynamic_offset, vector_dynamic_offsets, +2025-08-07T13:55:37Z USER 49724 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:55:37Z INFO 49724 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=7410 blocks=1 instructions=7478 Max writers: 191 Max Readers: 475 +2025-08-07T13:55:37Z USER 49724 [ModuleForkPass]: Running do_nothing +2025-08-07T13:55:37Z INFO 49724 [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=7410 blocks=1 instructions=7478 Max writers: 191 Max Readers: 475 +2025-08-07T13:55:37Z USER 49724 [ModuleForkPass]: do_nothing finished after 0.001 seconds +2025-08-07T13:55:37Z INFO 49724 [ModuleForkPass]: curr_vmrss: 218mb, ru_maxrss: 702mb (delta=0mb) +2025-08-07T13:55:37Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 7410 memory location(s), 1 block(s), and 7478 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:55:37Z USER 49724 [ModuleForkPass]: Running birverifier +2025-08-07T13:55:37Z INFO 49724 [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=7410 blocks=1 instructions=7478 Max writers: 191 Max Readers: 475 +2025-08-07T13:55:37Z WARNING 49724 [birverifier::InstVisitor]: (module) Non - output memory location with no reader: {convert.345.62687}@SB<0,0>(1x2)#Internal DebugInfo: +2025-08-07T13:55:37Z USER 49724 [ModuleForkPass]: birverifier finished after 0.208 seconds +2025-08-07T13:55:37Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1018mb, ru_maxrss: 1018mb (delta=316mb) +2025-08-07T13:55:37Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 7410 memory location(s), 1 block(s), and 7478 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:55:37Z USER 49724 [BackendPassManager]: mod_parallel_pass finished after 0.214 seconds +2025-08-07T13:55:37Z INFO 49724 [BackendPassManager]: curr_vmrss: 1010mb, ru_maxrss: 1018mb (delta=316mb) +2025-08-07T13:55:37Z INFO 49724 [BackendPassManager]: Output has 1 module(s), 1 function(s), 7410 memory location(s), 1 block(s), and 7478 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:55:37Z USER 49724 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:55:37Z INFO 49724 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=1 allocs=7410 blocks=1 instructions=7478 Max writers: 191 Max Readers: 475 +2025-08-07T13:55:37Z USER 49724 [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:55:37Z INFO 49724 [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=7410 blocks=1 instructions=7478 Max writers: 191 Max Readers: 475 +2025-08-07T13:55:37Z USER 49724 [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-08-07T13:55:37Z INFO 49724 [SubgraphForkPass]: curr_vmrss: 1010mb, ru_maxrss: 1018mb (delta=0mb) +2025-08-07T13:55:37Z INFO 49724 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 7410 memory location(s), 1 block(s), and 7478 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:55:37Z USER 49724 [BackendPassManager]: subgraph_parallel_pass finished after 0.002 seconds +2025-08-07T13:55:37Z INFO 49724 [BackendPassManager]: curr_vmrss: 1010mb, ru_maxrss: 1018mb (delta=0mb) +2025-08-07T13:55:37Z INFO 49724 [BackendPassManager]: Output has 1 module(s), 1 function(s), 7410 memory location(s), 1 block(s), and 7478 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:55:37Z USER 49724 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:55:37Z INFO 49724 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=7410 blocks=1 instructions=7478 Max writers: 191 Max Readers: 475 +2025-08-07T13:55:37Z USER 49724 [ModuleForkPass]: Running expand_replication +2025-08-07T13:55:37Z INFO 49724 [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=7410 blocks=1 instructions=7478 Max writers: 191 Max Readers: 475 +2025-08-07T13:55:37Z INFO 49724 [ExpandReplication]: Found 0 replicated matmults +2025-08-07T13:55:37Z USER 49724 [ModuleForkPass]: expand_replication finished after 0.001 seconds +2025-08-07T13:55:37Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1010mb, ru_maxrss: 1018mb (delta=0mb) +2025-08-07T13:55:37Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 7410 memory location(s), 1 block(s), and 7478 instruction(s). Max writers: 191 Max Readers: 475 +2025-08-07T13:55:37Z USER 49724 [ModuleForkPass]: Running unroll +2025-08-07T13:55:37Z INFO 49724 [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=7410 blocks=1 instructions=7478 Max writers: 191 Max Readers: 475 +2025-08-07T13:55:37Z INFO 49724 [Unroll]: INFO (Unroll) Start unrolling at Thu Aug 7 13:55:37 2025 +2025-08-07T13:55:40Z INFO 49724 [Unroll]: INFO (Unroll) DONE unrolling Thu Aug 7 13:55:37 2025 + +2025-08-07T13:55:40Z INFO 49724 [Unroll]: sg0000 Instruction count after Unroll: +2025-08-07T13:55:40Z INFO 49724 [Unroll]: Total count: 281317 +2025-08-07T13:55:40Z INFO 49724 [Unroll]: Matmult: 255192 +2025-08-07T13:55:40Z INFO 49724 [Unroll]: GenericCopy: 11855 +2025-08-07T13:55:40Z INFO 49724 [Unroll]: Load: 8260 +2025-08-07T13:55:40Z INFO 49724 [Unroll]: TensorTensor: 1341 +2025-08-07T13:55:40Z INFO 49724 [Unroll]: TensorScalarPtr: 1338 +2025-08-07T13:55:40Z INFO 49724 [Unroll]: Save: 682 +2025-08-07T13:55:40Z INFO 49724 [Unroll]: Activation: 545 +2025-08-07T13:55:40Z INFO 49724 [Unroll]: StreamShuffle: 510 +2025-08-07T13:55:40Z INFO 49724 [Unroll]: Memset: 336 +2025-08-07T13:55:40Z INFO 49724 [Unroll]: Max: 224 +2025-08-07T13:55:40Z INFO 49724 [Unroll]: MaxIndex: 224 +2025-08-07T13:55:40Z INFO 49724 [Unroll]: MatchReplace: 217 +2025-08-07T13:55:40Z INFO 49724 [Unroll]: TensorReduce: 187 +2025-08-07T13:55:40Z INFO 49724 [Unroll]: CollectiveCompute: 75 +2025-08-07T13:55:40Z INFO 49724 [Unroll]: Reciprocal: 75 +2025-08-07T13:55:40Z INFO 49724 [Unroll]: DMACopy: 74 +2025-08-07T13:55:40Z INFO 49724 [Unroll]: Iota: 73 +2025-08-07T13:55:40Z INFO 49724 [Unroll]: Select: 38 +2025-08-07T13:55:40Z INFO 49724 [Unroll]: StreamTranspose: 36 +2025-08-07T13:55:40Z INFO 49724 [Unroll]: Gather: 35 +2025-08-07T13:55:40Z INFO 49724 [Unroll]: Unrolled DGE count with Dynamic AP: 73 +2025-08-07T13:55:40Z USER 49724 [ModuleForkPass]: unroll finished after 2.629 seconds +2025-08-07T13:55:40Z INFO 49724 [ModuleForkPass]: curr_vmrss: 2417mb, ru_maxrss: 2417mb (delta=1399mb) +2025-08-07T13:55:40Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 29141 memory location(s), 1 block(s), and 281317 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:40Z USER 49724 [BackendPassManager]: mod_parallel_pass finished after 2.687 seconds +2025-08-07T13:55:40Z INFO 49724 [BackendPassManager]: curr_vmrss: 1505mb, ru_maxrss: 2417mb (delta=1399mb) +2025-08-07T13:55:40Z INFO 49724 [BackendPassManager]: Output has 1 module(s), 1 function(s), 29141 memory location(s), 1 block(s), and 281317 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:40Z USER 49724 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:55:40Z INFO 49724 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=1 allocs=29141 blocks=1 instructions=281317 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:40Z USER 49724 [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:55:40Z INFO 49724 [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=29141 blocks=1 instructions=281317 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:40Z INFO 49724 [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:55:40Z INFO 49724 [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:55:40Z INFO 49724 [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:55:40Z INFO 49724 [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:55:40Z USER 49724 [SubgraphForkPass]: dead_code_elim finished after 0.293 seconds +2025-08-07T13:55:40Z INFO 49724 [SubgraphForkPass]: curr_vmrss: 1513mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:55:40Z INFO 49724 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 28520 memory location(s), 1 block(s), and 281316 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:40Z USER 49724 [BackendPassManager]: subgraph_parallel_pass finished after 0.297 seconds +2025-08-07T13:55:40Z INFO 49724 [BackendPassManager]: curr_vmrss: 1513mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:55:40Z INFO 49724 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28520 memory location(s), 1 block(s), and 281316 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:40Z USER 49724 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:55:40Z INFO 49724 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=28520 blocks=1 instructions=281316 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:40Z USER 49724 [ModuleForkPass]: Running birverifier +2025-08-07T13:55:40Z INFO 49724 [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=28520 blocks=1 instructions=281316 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:40Z USER 49724 [ModuleForkPass]: birverifier finished after 0.263 seconds +2025-08-07T13:55:40Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1526mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:55:40Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28520 memory location(s), 1 block(s), and 281316 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:40Z USER 49724 [BackendPassManager]: mod_parallel_pass finished after 0.268 seconds +2025-08-07T13:55:40Z INFO 49724 [BackendPassManager]: curr_vmrss: 1526mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:55:40Z INFO 49724 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28520 memory location(s), 1 block(s), and 281316 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:40Z USER 49724 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:55:40Z INFO 49724 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=1 allocs=28520 blocks=1 instructions=281316 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:40Z USER 49724 [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:55:40Z INFO 49724 [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=28520 blocks=1 instructions=281316 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:40Z USER 49724 [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-08-07T13:55:40Z INFO 49724 [SubgraphForkPass]: curr_vmrss: 1526mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:55:40Z INFO 49724 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 28520 memory location(s), 1 block(s), and 281316 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:40Z USER 49724 [BackendPassManager]: subgraph_parallel_pass finished after 0.004 seconds +2025-08-07T13:55:40Z INFO 49724 [BackendPassManager]: curr_vmrss: 1526mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:55:40Z INFO 49724 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28520 memory location(s), 1 block(s), and 281316 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:40Z USER 49724 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:55:40Z INFO 49724 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=28520 blocks=1 instructions=281316 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:40Z USER 49724 [ModuleForkPass]: Running instruction_reorder +2025-08-07T13:55:40Z INFO 49724 [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=28520 blocks=1 instructions=281316 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:40Z USER 49724 [ModuleForkPass]: instruction_reorder finished after 0.046 seconds +2025-08-07T13:55:40Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1526mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:55:40Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28520 memory location(s), 1 block(s), and 281316 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:40Z USER 49724 [ModuleForkPass]: Running psum_legalization +2025-08-07T13:55:40Z INFO 49724 [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=28520 blocks=1 instructions=281316 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:40Z USER 49724 [ModuleForkPass]: psum_legalization finished after 0.023 seconds +2025-08-07T13:55:40Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1526mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:55:40Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28520 memory location(s), 1 block(s), and 281316 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:40Z USER 49724 [ModuleForkPass]: Running legalize_cce_dma +2025-08-07T13:55:40Z INFO 49724 [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=28520 blocks=1 instructions=281316 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:40Z USER 49724 [ModuleForkPass]: legalize_cce_dma finished after 0.026 seconds +2025-08-07T13:55:40Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1526mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:55:40Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28520 memory location(s), 1 block(s), and 281316 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:40Z USER 49724 [ModuleForkPass]: Running error_injector +2025-08-07T13:55:40Z INFO 49724 [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=28520 blocks=1 instructions=281316 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:40Z WARNING 49724 [ErrorInjector]: Unrecognized injected error value "0" +2025-08-07T13:55:40Z USER 49724 [ModuleForkPass]: error_injector finished after 0.001 seconds +2025-08-07T13:55:40Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1526mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:55:40Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28520 memory location(s), 1 block(s), and 281316 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:40Z USER 49724 [ModuleForkPass]: Running vn_splitter +2025-08-07T13:55:40Z INFO 49724 [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=28520 blocks=1 instructions=281316 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:40Z INFO 49724 [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 14 +2025-08-07T13:55:40Z INFO 49724 [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-08-07T13:55:40Z INFO 49724 [ShrinkDN]: INFO (ShrinkDN): Shrunk 3 nodes. Total savings 14456 bytes/partition +2025-08-07T13:55:41Z INFO 49724 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-08-07T13:55:41Z INFO 49724 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-08-07T13:55:41Z INFO 49724 [VNSplitterPass]: INFO (VNSplitter) Time: 0.001 seconds +2025-08-07T13:55:41Z INFO 49724 [VNSplitterPass]: INFO (VerticalFusion) Time: 0.033 seconds +2025-08-07T13:55:41Z INFO 49724 [VNSplitterPass]: INFO (ShrinkDN) Time: 0.044 seconds +2025-08-07T13:55:41Z USER 49724 [ModuleForkPass]: vn_splitter finished after 0.127 seconds +2025-08-07T13:55:41Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1530mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:55:41Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28520 memory location(s), 1 block(s), and 281316 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:41Z USER 49724 [ModuleForkPass]: Running constant_propagate +2025-08-07T13:55:41Z INFO 49724 [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=28520 blocks=1 instructions=281316 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:41Z INFO 49724 [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-08-07T13:55:41Z INFO 49724 [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:55:41Z INFO 49724 [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:55:41Z INFO 49724 [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:55:41Z INFO 49724 [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:55:41Z INFO 49724 [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-08-07T13:55:41Z INFO 49724 [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-08-07T13:55:41Z INFO 49724 [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:55:41Z INFO 49724 [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:55:41Z INFO 49724 [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:55:41Z USER 49724 [ModuleForkPass]: constant_propagate finished after 0.634 seconds +2025-08-07T13:55:41Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1532mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:55:41Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28520 memory location(s), 1 block(s), and 281316 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:41Z USER 49724 [ModuleForkPass]: Running lower_ac +2025-08-07T13:55:41Z INFO 49724 [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=28520 blocks=1 instructions=281316 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:41Z INFO 49724 [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-08-07T13:55:41Z USER 49724 [ModuleForkPass]: lower_ac finished after 0.041 seconds +2025-08-07T13:55:41Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1532mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:55:41Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28520 memory location(s), 1 block(s), and 281316 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:41Z USER 49724 [ModuleForkPass]: Running input_dma_coalescing +2025-08-07T13:55:41Z INFO 49724 [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=28520 blocks=1 instructions=281316 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:41Z INFO 49724 [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-08-07T13:55:41Z USER 49724 [ModuleForkPass]: input_dma_coalescing finished after 0.080 seconds +2025-08-07T13:55:41Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1532mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:55:41Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28520 memory location(s), 1 block(s), and 281316 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:41Z USER 49724 [ModuleForkPass]: Running remat_optimization +2025-08-07T13:55:41Z INFO 49724 [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=28520 blocks=1 instructions=281316 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:41Z INFO 49724 [RematOpt]: Removed 0 remat instructions +2025-08-07T13:55:41Z USER 49724 [ModuleForkPass]: remat_optimization finished after 0.155 seconds +2025-08-07T13:55:41Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1535mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:55:41Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28520 memory location(s), 1 block(s), and 281316 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:41Z USER 49724 [ModuleForkPass]: Running early_peephole_opts +2025-08-07T13:55:41Z INFO 49724 [ModuleForkPass]: Inputs to early_peephole_opts: modules=1 functions=1 allocs=28520 blocks=1 instructions=281316 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:41Z INFO 49724 [EarlyPeepholeOpts]: PeepholeOpts enabled? ActivationAccumulate: true +2025-08-07T13:55:42Z INFO 49724 [EarlyPeepholeOpts]: Activation Accumulate: 0 +2025-08-07T13:55:42Z USER 49724 [ModuleForkPass]: early_peephole_opts finished after 0.088 seconds +2025-08-07T13:55:42Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1535mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:55:42Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28520 memory location(s), 1 block(s), and 281316 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:42Z USER 49724 [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-08-07T13:55:42Z INFO 49724 [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=28520 blocks=1 instructions=281316 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:42Z USER 49724 [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.022 seconds +2025-08-07T13:55:42Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1535mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:55:42Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28520 memory location(s), 1 block(s), and 281316 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:42Z USER 49724 [ModuleForkPass]: Running infer_stream_ids +2025-08-07T13:55:42Z INFO 49724 [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=28520 blocks=1 instructions=281316 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:42Z USER 49724 [ModuleForkPass]: infer_stream_ids finished after 0.022 seconds +2025-08-07T13:55:42Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1535mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:55:42Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28520 memory location(s), 1 block(s), and 281316 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:42Z USER 49724 [ModuleForkPass]: Running pre_sched +2025-08-07T13:55:42Z INFO 49724 [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=28520 blocks=1 instructions=281316 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:42Z INFO 49724 [PreSched]: Start PRE scheduling 2 cores: 1 at: Thu Aug 7 13:55:42 2025 +2025-08-07T13:55:42Z INFO 49724 [LayerSpiller]: LayerSpill: Start... +2025-08-07T13:55:42Z INFO 49724 [LayerSpiller]: LayerSpill: Found 72 Splits CCs +2025-08-07T13:55:42Z INFO 49724 [LayerSpiller]: Grouped CCs to 72 clusters. +2025-08-07T13:55:42Z INFO 49724 [LayerSpiller]: LayerSpill: To Spill 60 multi-layer tensors +2025-08-07T13:55:42Z INFO 49724 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-08-07T13:55:42Z INFO 49724 [LayerSpiller]: LayerSpill: Done. +2025-08-07T13:55:42Z INFO 49724 [PreSched]: Start split live ranges Thu Aug 7 13:55:42 2025 +2025-08-07T13:55:42Z INFO 49724 [PreSched]: Num_Splits: 0 +2025-08-07T13:55:42Z INFO 49724 [PreSched]: End split live ranges Thu Aug 7 13:55:42 2025 +2025-08-07T13:55:42Z INFO 49724 [PreSched]: Strt remove redundncies Thu Aug 7 13:55:42 2025 +2025-08-07T13:55:42Z INFO 49724 [PreSched]: remove_redundant_memsets +2025-08-07T13:55:42Z INFO 49724 [PreSched]: remove_redundant_memsets: 36 +2025-08-07T13:55:42Z INFO 49724 [PreSched]: remove_redundant_loads +2025-08-07T13:55:42Z INFO 49724 [PreSched]: remove_redundant_loads: 0 +2025-08-07T13:55:42Z INFO 49724 [PreSched]: End remove redundncies Thu Aug 7 13:55:42 2025 +2025-08-07T13:55:42Z INFO 49724 [PreSched]: Start DCE Thu Aug 7 13:55:42 2025 +2025-08-07T13:55:42Z INFO 49724 [PreSched]: eliminateDeadStore removed 0 instructions +2025-08-07T13:55:42Z INFO 49724 [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:55:42Z INFO 49724 [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:55:42Z INFO 49724 [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:55:42Z INFO 49724 [PreSched]: End DCE Thu Aug 7 13:55:42 2025 +2025-08-07T13:55:42Z INFO 49724 [PreSched]: Start build flow dependencies Thu Aug 7 13:55:42 2025 +2025-08-07T13:55:42Z INFO 49724 [build_flow_deps]: Start build fdeps. Invocation: 1Thu Aug 7 13:55:42 2025 +2025-08-07T13:55:42Z INFO 49724 [build_flow_deps]: Allocs: 28640 instructions: 281400 +2025-08-07T13:55:43Z INFO 49724 [build_flow_deps]: Build fdeps inserted 827835 edges +2025-08-07T13:55:43Z INFO 49724 [build_flow_deps]: Done build fdeps 827835 Thu Aug 7 13:55:43 2025 +2025-08-07T13:55:43Z INFO 49724 [PreSched]: End build flow dependencies Thu Aug 7 13:55:43 2025 +2025-08-07T13:55:43Z INFO 49724 [PreSched]: Start remove useless insts Thu Aug 7 13:55:43 2025 +2025-08-07T13:55:43Z INFO 49724 [PreSched]: remove_useless_insts +2025-08-07T13:55:43Z INFO 49724 [PreSched]: remove Useless Instructions: 0 +2025-08-07T13:55:43Z INFO 49724 [PreSched]: End remove useless insts Thu Aug 7 13:55:43 2025 +2025-08-07T13:55:43Z INFO 49724 [PreSched]: Start scratchpad optimization Thu Aug 7 13:55:43 2025 +2025-08-07T13:55:43Z INFO 49724 [PreSched]: End scratchpad optimization Thu Aug 7 13:55:43 2025 +2025-08-07T13:55:43Z INFO 49724 [PreSched]: DONE PRE scheduling Thu Aug 7 13:55:43 2025 +2025-08-07T13:55:43Z USER 49724 [ModuleForkPass]: pre_sched finished after 1.884 seconds +2025-08-07T13:55:43Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1684mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:55:43Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28640 memory location(s), 1 block(s), and 281400 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:43Z USER 49724 [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:55:43Z INFO 49724 [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=28640 blocks=1 instructions=281400 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:44Z INFO 49724 [TensorCopyElim]: Tensor CP elimination: 1 +2025-08-07T13:55:44Z INFO 49724 [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:55:44Z INFO 49724 [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:55:44Z INFO 49724 [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-08-07T13:55:44Z INFO 49724 [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-08-07T13:55:44Z USER 49724 [ModuleForkPass]: tensor_copy_elim finished after 0.392 seconds +2025-08-07T13:55:44Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1684mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:55:44Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28639 memory location(s), 1 block(s), and 281399 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:44Z USER 49724 [ModuleForkPass]: Running dynamic_dma_setup +2025-08-07T13:55:44Z INFO 49724 [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=28639 blocks=1 instructions=281399 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:44Z USER 49724 [ModuleForkPass]: dynamic_dma_setup finished after 0.001 seconds +2025-08-07T13:55:44Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1684mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:55:44Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28640 memory location(s), 1 block(s), and 281399 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:44Z USER 49724 [ModuleForkPass]: Running runtime_memory_reservation +2025-08-07T13:55:44Z INFO 49724 [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=28640 blocks=1 instructions=281399 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:44Z USER 49724 [ModuleForkPass]: runtime_memory_reservation finished after 0.001 seconds +2025-08-07T13:55:44Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1684mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:55:44Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28640 memory location(s), 1 block(s), and 281399 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:44Z USER 49724 [ModuleForkPass]: Running coloring_allocator_psum +2025-08-07T13:55:44Z INFO 49724 [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=28640 blocks=1 instructions=281399 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:44Z INFO 49724 [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:55:44Z INFO 49724 [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:55:44Z INFO 49724 [PSUM_Allocator]: allocating PSUM +2025-08-07T13:55:44Z INFO 49724 [PSUM_Allocator]: main loop +2025-08-07T13:55:44Z INFO 49724 [PSUM_Allocator]: renumber locations +2025-08-07T13:55:44Z INFO 49724 [PSUM_Allocator]: size = 12102 +2025-08-07T13:55:44Z INFO 49724 [PSUM_Allocator]: build_no_bitmap start +2025-08-07T13:55:44Z INFO 49724 [PSUM_Allocator]: 100% PSUM demand before spilling +2025-08-07T13:55:44Z INFO 49724 [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-08-07T13:55:44Z INFO 49724 [PSUM_Allocator]: found 23077 edges +2025-08-07T13:55:44Z INFO 49724 [PSUM_Allocator]: mean: 3.81375 +2025-08-07T13:55:44Z INFO 49724 [PSUM_Allocator]: median: 2.36817 +2025-08-07T13:55:44Z INFO 49724 [PSUM_Allocator]: adjacency vectors require 184616 bytes +2025-08-07T13:55:44Z INFO 49724 [PSUM_Allocator]: build_no_bitmap done +2025-08-07T13:55:44Z INFO 49724 [PSUM_Allocator]: find costs +2025-08-07T13:55:49Z INFO 49724 [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-08-07T13:55:49Z INFO 49724 [PSUM_Allocator]: simplify interference graph +2025-08-07T13:55:49Z INFO 49724 [PSUM_Allocator]: initialize low and high +2025-08-07T13:55:49Z INFO 49724 [PSUM_Allocator]: lo = 12102 +2025-08-07T13:55:49Z INFO 49724 [PSUM_Allocator]: hi = 0 +2025-08-07T13:55:49Z INFO 49724 [PSUM_Allocator]: inf = 0 +2025-08-07T13:55:49Z INFO 49724 [PSUM_Allocator]: total = 12102 +2025-08-07T13:55:49Z INFO 49724 [PSUM_Allocator]: simplify +2025-08-07T13:55:49Z INFO 49724 [PSUM_Allocator]: new candidates = 0 +2025-08-07T13:55:49Z INFO 49724 [PSUM_Allocator]: select ranges +2025-08-07T13:55:50Z INFO 49724 [PSUM_Allocator]: no more spills +2025-08-07T13:55:50Z INFO 49724 [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-08-07T13:55:50Z INFO 49724 [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-08-07T13:55:50Z INFO 49724 [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-08-07T13:55:50Z USER 49724 [ModuleForkPass]: coloring_allocator_psum finished after 5.662 seconds +2025-08-07T13:55:50Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1688mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:55:50Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28640 memory location(s), 1 block(s), and 281399 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:50Z USER 49724 [ModuleForkPass]: Running dma_optimization_psum +2025-08-07T13:55:50Z INFO 49724 [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=28640 blocks=1 instructions=281399 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:50Z INFO 49724 [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-08-07T13:55:50Z INFO 49724 [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-08-07T13:55:50Z USER 49724 [ModuleForkPass]: dma_optimization_psum finished after 0.176 seconds +2025-08-07T13:55:50Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1688mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:55:50Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28640 memory location(s), 1 block(s), and 281399 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:50Z USER 49724 [ModuleForkPass]: Running address_rotation_psum +2025-08-07T13:55:50Z INFO 49724 [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=28640 blocks=1 instructions=281399 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:50Z INFO 49724 [DMAOptimizationBase]: PSUM Rotation rotated 760 PSUM Banks +2025-08-07T13:55:50Z INFO 49724 [DMAOptimizationBase]: PSUM Rotation rotated 156 PSUM Banks +2025-08-07T13:55:51Z INFO 49724 [DMAOptimizationBase]: PSUM Rotation rotated 452 PSUM Banks +2025-08-07T13:55:51Z USER 49724 [ModuleForkPass]: address_rotation_psum finished after 1.060 seconds +2025-08-07T13:55:51Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1693mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:55:51Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28640 memory location(s), 1 block(s), and 281399 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:51Z USER 49724 [ModuleForkPass]: Running coloring_allocator_sb +2025-08-07T13:55:51Z INFO 49724 [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=28640 blocks=1 instructions=281399 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:55:51Z INFO 49724 [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 7649430086 +2025-08-07T13:55:51Z INFO 49724 [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 7376 bytes +2025-08-07T13:55:51Z INFO 49724 [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 2812938 +2025-08-07T13:55:51Z INFO 49724 [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 397 bytes +2025-08-07T13:55:51Z INFO 49724 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 78980 +2025-08-07T13:55:51Z INFO 49724 [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 136 bytes +2025-08-07T13:55:51Z INFO 49724 [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:55:51Z INFO 49724 [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:55:51Z INFO 49724 [SB_Allocator]: allocating SB +2025-08-07T13:55:51Z INFO 49724 [SB_Allocator]: main loop +2025-08-07T13:55:51Z INFO 49724 [SB_Allocator]: renumber locations +2025-08-07T13:55:51Z INFO 49724 [SB_Allocator]: size = 15686 +2025-08-07T13:55:51Z INFO 49724 [SB_Allocator]: find partners +2025-08-07T13:55:51Z INFO 49724 [SB_Allocator]: found 11811 accumulation groups +2025-08-07T13:55:51Z INFO 49724 [SB_Allocator]: largest = _dot.10343-t42504 +2025-08-07T13:55:51Z INFO 49724 [SB_Allocator]: tensors = 49 +2025-08-07T13:55:51Z INFO 49724 [SB_Allocator]: requires 393280 bytes/partition +2025-08-07T13:55:51Z WARNING 49724 [SB_Allocator]: accumulation group is too large for SB +2025-08-07T13:55:51Z INFO 49724 [SB_Allocator]: expanding partners +2025-08-07T13:55:51Z INFO 49724 []: find first defs for local +2025-08-07T13:55:51Z INFO 49724 []: find first defs for global +2025-08-07T13:55:51Z INFO 49724 [SB_Allocator]: find loads +2025-08-07T13:55:51Z INFO 49724 [SB_Allocator]: 1 pin count +2025-08-07T13:55:51Z INFO 49724 [SB_Allocator]: 8233 remat count +2025-08-07T13:55:51Z INFO 49724 [SB_Allocator]: 1 pinned tensors will require about 16384 bytes/partition +2025-08-07T13:55:51Z INFO 49724 [SB_Allocator]: build interference graph +2025-08-07T13:55:51Z INFO 49724 [SB_Allocator]: pass 1 int-tree +2025-08-07T13:55:51Z INFO 49724 [SB_Allocator]: Num intervals 15686 Num locations 15686 +2025-08-07T13:55:51Z INFO 49724 [SB_Allocator]: IntervalTree Build Done +2025-08-07T13:55:51Z INFO 49724 [SB_Allocator]: info.neighbors init Done +2025-08-07T13:55:51Z INFO 49724 [SB_Allocator]: info.neighbors partners Done +2025-08-07T13:55:51Z INFO 49724 [SB_Allocator]: IntervalTree readback Done +2025-08-07T13:55:51Z INFO 49724 [SB_Allocator]: edge: 157594 +2025-08-07T13:55:51Z INFO 49724 [SB_Allocator]: mean: 20.0936 +2025-08-07T13:55:51Z INFO 49724 [SB_Allocator]: median: 10.861 +2025-08-07T13:55:51Z INFO 49724 [SB_Allocator]: find costs +2025-08-07T13:55:52Z INFO 49724 [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-08-07T13:55:52Z INFO 49724 [SB_Allocator]: simplify interference graph +2025-08-07T13:55:52Z INFO 49724 [SB_Allocator]: initialize safe & unsafe +2025-08-07T13:55:52Z INFO 49724 [SB_Allocator]: safe = 15018 +2025-08-07T13:55:52Z INFO 49724 [SB_Allocator]: unsafe = 384 +2025-08-07T13:55:52Z INFO 49724 [SB_Allocator]: inf = 283 +2025-08-07T13:55:52Z INFO 49724 [SB_Allocator]: total = 15685 +2025-08-07T13:55:52Z INFO 49724 [SB_Allocator]: simplify +2025-08-07T13:55:52Z INFO 49724 [SB_Allocator]: simplify_step3_sorted2 #Unsafe 121 #Pinned 0 #Safe 0 minCost 0.00148816 maxCost 1.13634 locations 15686 +2025-08-07T13:55:52Z INFO 49724 [SB_Allocator]: new candidates = 9 +2025-08-07T13:55:52Z INFO 49724 [SB_Allocator]: select ranges +2025-08-07T13:55:52Z INFO 49724 [SB_Allocator]: Total: 15685 +2025-08-07T13:55:52Z INFO 49724 [SB_Allocator]: Spilled: 0.000 (0) +2025-08-07T13:55:52Z INFO 49724 [SB_Allocator]: Allocated: 1.000 (15685) +2025-08-07T13:55:52Z INFO 49724 [SB_Allocator]: Rover zone: 0.954 (14961) +2025-08-07T13:55:52Z INFO 49724 [SB_Allocator]: Pre-rover zone: 0.032 (497) +2025-08-07T13:55:52Z INFO 49724 [SB_Allocator]: Post-rover zone: 0.012 (193) +2025-08-07T13:55:52Z INFO 49724 [SB_Allocator]: Slice zone: 0.002 (34) +2025-08-07T13:55:52Z INFO 49724 [SB_Allocator]: Blocks nothing: 0.031 (489) +2025-08-07T13:55:52Z INFO 49724 [SB_Allocator]: Blocks medium: 0.005 (84) +2025-08-07T13:55:52Z INFO 49724 [SB_Allocator]: Visited until medium blocking (mean): 0.624 +2025-08-07T13:55:52Z INFO 49724 [SB_Allocator]: Visited until medium blocking (median): 0.712 +2025-08-07T13:55:52Z INFO 49724 [SB_Allocator]: Visited until medium blocking (p95): 0.900 +2025-08-07T13:55:52Z INFO 49724 [SB_Allocator]: Blocks tall: 0.963 (15112) +2025-08-07T13:55:52Z INFO 49724 [SB_Allocator]: Visited until tall blocking (mean): 0.894 +2025-08-07T13:55:52Z INFO 49724 [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-08-07T13:55:52Z INFO 49724 [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-08-07T13:55:52Z INFO 49724 [SB_Allocator]: Success +2025-08-07T13:56:16Z INFO 49724 [SB_Allocator]: SB spills = 0 tensors +2025-08-07T13:56:16Z INFO 49724 [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:56:16Z INFO 49724 [SB_Allocator]: remats = 0 tensors +2025-08-07T13:56:16Z INFO 49724 [SB_Allocator]: unpinned = 0 tensors +2025-08-07T13:56:16Z INFO 49724 [SB_Allocator]: size = 0 bytes/partition +2025-08-07T13:56:16Z INFO 49724 [SB_Allocator]: SB score = 0 +2025-08-07T13:56:16Z INFO 49724 [SB_Allocator]: spilling from SB cost about 0 cycles +2025-08-07T13:56:16Z INFO 49724 [SB_Allocator]: 16384 bytes/partition (100%) successfully pinned +2025-08-07T13:56:16Z INFO 49724 [SB_Allocator]: pinning saved approximately 9010 cycles +2025-08-07T13:56:16Z INFO 49724 [SB_Allocator]: 0% SB utilization after allocation +2025-08-07T13:56:16Z INFO 49724 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 7649430086 +2025-08-07T13:56:16Z INFO 49724 [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 7376 bytes +2025-08-07T13:56:16Z INFO 49724 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 2812938 +2025-08-07T13:56:16Z INFO 49724 [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 397 bytes +2025-08-07T13:56:16Z INFO 49724 [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 78980 +2025-08-07T13:56:16Z INFO 49724 [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 136 bytes +2025-08-07T13:56:16Z USER 49724 [ModuleForkPass]: coloring_allocator_sb finished after 25.660 seconds +2025-08-07T13:56:16Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1702mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:16Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28640 memory location(s), 1 block(s), and 281399 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:16Z USER 49724 [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:56:16Z INFO 49724 [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=28640 blocks=1 instructions=281399 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:17Z INFO 49724 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:56:17Z USER 49724 [ModuleForkPass]: address_rotation_sb finished after 0.409 seconds +2025-08-07T13:56:17Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1705mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:17Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28640 memory location(s), 1 block(s), and 281399 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:17Z USER 49724 [ModuleForkPass]: Running dma_optimization_sb +2025-08-07T13:56:17Z INFO 49724 [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=28640 blocks=1 instructions=281399 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:17Z INFO 49724 [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 7652243024, 99.9264% input load, 5.22723e-08% output write, 0.0735995% spill/reload [sg0000] +2025-08-07T13:56:17Z INFO 49724 [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:56:17Z INFO 49724 [DMAOptimizationBase]: removed 0 identical load +2025-08-07T13:56:17Z INFO 49724 [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:56:17Z INFO 49724 [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-08-07T13:56:17Z INFO 49724 [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-08-07T13:56:17Z INFO 49724 [DMAOptimizationBase]: [IO to internal DMACopy Insertion]: inserted 0 DMACopy instructions +2025-08-07T13:56:17Z INFO 49724 [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-08-07T13:56:17Z INFO 49724 [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-08-07T13:56:17Z INFO 49724 [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 144, 1.8818e-06% out of total dma traffic(7.64661e+09) +2025-08-07T13:56:17Z INFO 49724 [DMAOptimizationBase]: [spill optimization round 0]: removed 6 spill/reload instructions +2025-08-07T13:56:17Z INFO 49724 [DMAOptimizationBase]: [spill optimization round 0]: removed 6 spill/reload memory locations +2025-08-07T13:56:18Z INFO 49724 [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload instructions +2025-08-07T13:56:18Z INFO 49724 [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload memory locations +2025-08-07T13:56:18Z INFO 49724 [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 4100, 0.0727981% out of total spill/reload dma traffic +2025-08-07T13:56:18Z INFO 49724 [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-08-07T13:56:18Z INFO 49724 [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-08-07T13:56:18Z INFO 49724 [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:56:18Z INFO 49724 [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-08-07T13:56:18Z INFO 49724 [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-08-07T13:56:18Z INFO 49724 [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:56:18Z INFO 49724 [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-08-07T13:56:18Z INFO 49724 [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-08-07T13:56:18Z INFO 49724 [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-08-07T13:56:18Z INFO 49724 [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-08-07T13:56:18Z INFO 49724 [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 116 SpillSaves and Reloads +2025-08-07T13:56:18Z INFO 49724 [DMAOptimizationBase]: average loaded DMA size 7389 bytes +2025-08-07T13:56:18Z INFO 49724 [DMAOptimizationBase]: average saved DMA size 539 bytes +2025-08-07T13:56:19Z INFO 49724 [DMAOptimizationBase]: DMA SpillSave Coalescing Round 1 combined 56 SpillSaves and Reloads +2025-08-07T13:56:19Z INFO 49724 [DMAOptimizationBase]: average loaded DMA size 7396 bytes +2025-08-07T13:56:19Z INFO 49724 [DMAOptimizationBase]: average saved DMA size 651 bytes +2025-08-07T13:56:19Z INFO 49724 [DMAOptimizationBase]: DMA SpillSave Coalescing Round 2 combined 0 SpillSaves and Reloads +2025-08-07T13:56:19Z INFO 49724 [DMAOptimizationBase]: average loaded DMA size 7396 bytes +2025-08-07T13:56:19Z INFO 49724 [DMAOptimizationBase]: average saved DMA size 651 bytes +2025-08-07T13:56:19Z INFO 49724 [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 7649427892 +2025-08-07T13:56:19Z INFO 49724 [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 7396 bytes +2025-08-07T13:56:19Z INFO 49724 [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 2810888 +2025-08-07T13:56:19Z INFO 49724 [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 651 bytes +2025-08-07T13:56:19Z INFO 49724 [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-08-07T13:56:19Z INFO 49724 [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-08-07T13:56:19Z INFO 49724 [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 4244, 5.54609e-05% out of total dma traffic +2025-08-07T13:56:19Z INFO 49724 [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 7652238780, 99.9265% input load, 5.22723e-08% output write, 0.073546% spill/reload [sg0000] +2025-08-07T13:56:19Z INFO 49724 [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 7649427892 +2025-08-07T13:56:19Z INFO 49724 [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 7396 bytes +2025-08-07T13:56:19Z INFO 49724 [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 2810888 +2025-08-07T13:56:19Z INFO 49724 [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 651 bytes +2025-08-07T13:56:19Z INFO 49724 [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 78980 +2025-08-07T13:56:19Z INFO 49724 [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 136 bytes +2025-08-07T13:56:19Z INFO 49724 [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 7363 bytes +2025-08-07T13:56:19Z INFO 49724 [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-08-07T13:56:19Z INFO 49724 [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-08-07T13:56:19Z USER 49724 [ModuleForkPass]: dma_optimization_sb finished after 2.312 seconds +2025-08-07T13:56:19Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1737mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:19Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281271 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:19Z USER 49724 [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:56:19Z INFO 49724 [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=28467 blocks=1 instructions=281271 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:19Z INFO 49724 [DMAOptimizationBase]: SB Rotation rotated 358 Sb address +2025-08-07T13:56:20Z INFO 49724 [DMAOptimizationBase]: SB Rotation rotated 4271 Sb address +2025-08-07T13:56:20Z INFO 49724 [DMAOptimizationBase]: SB Rotation rotated 855 Sb address +2025-08-07T13:56:20Z INFO 49724 [DMAOptimizationBase]: SB Rotation rotated 436 Sb address +2025-08-07T13:56:21Z INFO 49724 [DMAOptimizationBase]: SB Rotation rotated 2028 Sb address +2025-08-07T13:56:21Z INFO 49724 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:56:21Z USER 49724 [ModuleForkPass]: address_rotation_sb finished after 1.743 seconds +2025-08-07T13:56:21Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1737mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:21Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281271 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:21Z USER 49724 [ModuleForkPass]: Running coloring_allocator_dram +2025-08-07T13:56:21Z INFO 49724 [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=28467 blocks=1 instructions=281271 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:21Z INFO 49724 [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:56:21Z INFO 49724 [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:56:21Z INFO 49724 [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-08-07T13:56:21Z INFO 49724 [DRAM_Allocator]: reserved space = 8344453408 bytes +2025-08-07T13:56:21Z INFO 49724 [DRAM_Allocator]: spill space = 3421188 bytes +2025-08-07T13:56:21Z INFO 49724 [DRAM_Allocator]: aligned spill space = 3469312 bytes +2025-08-07T13:56:21Z INFO 49724 [DRAM_Allocator]: dram space = 107374182400 bytes +2025-08-07T13:56:21Z INFO 49724 [DRAM_Allocator]: renumber locations +2025-08-07T13:56:21Z INFO 49724 [DRAM_Allocator]: size = 178 +2025-08-07T13:56:21Z INFO 49724 []: find first defs for local +2025-08-07T13:56:21Z INFO 49724 []: find first defs for global +2025-08-07T13:56:21Z INFO 49724 [DRAM_Allocator]: Num intervals 178 Num locations 178 +2025-08-07T13:56:21Z INFO 49724 [DRAM_Allocator]: IntervalTree Build Done +2025-08-07T13:56:21Z INFO 49724 [DRAM_Allocator]: info.neighbors init Done +2025-08-07T13:56:21Z INFO 49724 [DRAM_Allocator]: IntervalTree readback Done +2025-08-07T13:56:21Z INFO 49724 [DRAM_Allocator]: simplify interference graph +2025-08-07T13:56:21Z INFO 49724 [DRAM_Allocator]: initialize low and high +2025-08-07T13:56:21Z INFO 49724 [DRAM_Allocator]: lo = 178 +2025-08-07T13:56:21Z INFO 49724 [DRAM_Allocator]: hi = 0 +2025-08-07T13:56:21Z INFO 49724 [DRAM_Allocator]: total = 178 +2025-08-07T13:56:21Z INFO 49724 [DRAM_Allocator]: simplify +2025-08-07T13:56:21Z INFO 49724 [DRAM_Allocator]: new candidates = 0 +2025-08-07T13:56:21Z INFO 49724 [DRAM_Allocator]: select ranges +2025-08-07T13:56:21Z INFO 49724 [DRAM_Allocator]: CC buffer size limit 524288000 +2025-08-07T13:56:21Z INFO 49724 [DRAM_Allocator]: allreduce_dram_hwm 1208320 +2025-08-07T13:56:21Z INFO 49724 [DRAM_Allocator]: Real CC buffer size 1208320 +2025-08-07T13:56:21Z INFO 49724 [DRAM_Allocator]: DRAM hwm after allocation: 3117056 +2025-08-07T13:56:21Z INFO 49724 [DRAM_Allocator]: DRAM allocation successful +2025-08-07T13:56:21Z USER 49724 [ModuleForkPass]: coloring_allocator_dram finished after 0.380 seconds +2025-08-07T13:56:21Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1738mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:21Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281271 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:21Z USER 49724 [ModuleForkPass]: Running address_rotation_dram +2025-08-07T13:56:21Z INFO 49724 [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=28467 blocks=1 instructions=281271 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:21Z INFO 49724 [DMAOptimizationBase]: Runtime page size at 512MB +2025-08-07T13:56:21Z INFO 49724 [DMAOptimizationBase]: DRAM hwm before rotation 3117056 +2025-08-07T13:56:21Z INFO 49724 [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-08-07T13:56:21Z INFO 49724 [DMAOptimizationBase]: allreduce hwm 1208320 +2025-08-07T13:56:21Z INFO 49724 [DMAOptimizationBase]: Real CC buffer size 1208320 +2025-08-07T13:56:22Z INFO 49724 [DMAOptimizationBase]: DRAM hwm after rotation 3117056 +2025-08-07T13:56:22Z INFO 49724 [DMAOptimizationBase]: DRAM Rotation rotated 9 Dram address +2025-08-07T13:56:22Z USER 49724 [ModuleForkPass]: address_rotation_dram finished after 0.196 seconds +2025-08-07T13:56:22Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1740mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:22Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281271 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:22Z USER 49724 [ModuleForkPass]: Running tensorcopy_accel +2025-08-07T13:56:22Z INFO 49724 [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=28467 blocks=1 instructions=281271 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:22Z INFO 49724 [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-08-07T13:56:22Z INFO 49724 [TensorCopyAccel::Impl]: Accelerated 72 out of 12153 tensorcopy in Function: sg0000 average acceleration factor: 1 +2025-08-07T13:56:22Z USER 49724 [ModuleForkPass]: tensorcopy_accel finished after 0.025 seconds +2025-08-07T13:56:22Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1740mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:22Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281271 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:22Z USER 49724 [ModuleForkPass]: Running peephole_opts +2025-08-07T13:56:22Z INFO 49724 [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=28467 blocks=1 instructions=281271 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:22Z INFO 49724 [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-08-07T13:56:22Z USER 49724 [ModuleForkPass]: peephole_opts finished after 0.107 seconds +2025-08-07T13:56:22Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1740mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:22Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281309 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:22Z USER 49724 [ModuleForkPass]: Running lower_kernel +2025-08-07T13:56:22Z INFO 49724 [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=28467 blocks=1 instructions=281309 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:22Z INFO 49724 [LowerKernel]: Started running LowerKernel +2025-08-07T13:56:22Z INFO 49724 [LowerKernel]: Start of kernel lowering pass, number of insts: 281309, number of allocs: 28467 +2025-08-07T13:56:22Z INFO 49724 [LowerKernel]: Scan BKs time (s): 0.022303 +2025-08-07T13:56:22Z INFO 49724 [LowerKernel]: Lower BKs time (s): 1.1e-05 +2025-08-07T13:56:22Z USER 49724 [ModuleForkPass]: lower_kernel finished after 0.026 seconds +2025-08-07T13:56:22Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1740mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:22Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281309 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:22Z USER 49724 [ModuleForkPass]: Running lower_nki_kernel +2025-08-07T13:56:22Z INFO 49724 [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=28467 blocks=1 instructions=281309 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:22Z USER 49724 [ModuleForkPass]: lower_nki_kernel finished after 0.025 seconds +2025-08-07T13:56:22Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1740mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:22Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281309 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:22Z USER 49724 [ModuleForkPass]: Running dynamic_dma_cleanup +2025-08-07T13:56:22Z INFO 49724 [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=28467 blocks=1 instructions=281309 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:22Z USER 49724 [ModuleForkPass]: dynamic_dma_cleanup finished after 0.081 seconds +2025-08-07T13:56:22Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1742mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:22Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281309 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:22Z USER 49724 [ModuleForkPass]: Running birverifier +2025-08-07T13:56:22Z INFO 49724 [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=28467 blocks=1 instructions=281309 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:22Z USER 49724 [ModuleForkPass]: birverifier finished after 0.224 seconds +2025-08-07T13:56:22Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1742mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:22Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281309 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:22Z USER 49724 [ModuleForkPass]: Running dynamic_dma_scan +2025-08-07T13:56:22Z INFO 49724 [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=28467 blocks=1 instructions=281309 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:22Z USER 49724 [ModuleForkPass]: dynamic_dma_scan finished after 0.032 seconds +2025-08-07T13:56:22Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1742mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:22Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281309 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:22Z USER 49724 [ModuleForkPass]: Running build_fdeps +2025-08-07T13:56:22Z INFO 49724 [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=28467 blocks=1 instructions=281309 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:22Z INFO 49724 [build_flow_deps]: Start build fdeps. Invocation: 2Thu Aug 7 13:56:22 2025 +2025-08-07T13:56:22Z INFO 49724 [build_flow_deps]: Allocs: 28467 instructions: 281309 +2025-08-07T13:56:23Z INFO 49724 [build_flow_deps]: Build fdeps inserted 828112 edges +2025-08-07T13:56:23Z INFO 49724 [build_flow_deps]: Done build fdeps 828112 Thu Aug 7 13:56:23 2025 +2025-08-07T13:56:23Z USER 49724 [ModuleForkPass]: build_fdeps finished after 0.644 seconds +2025-08-07T13:56:23Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1751mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:23Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281309 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:23Z USER 49724 [ModuleForkPass]: Running remove_redundancies +2025-08-07T13:56:23Z INFO 49724 [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=28467 blocks=1 instructions=281309 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:23Z INFO 49724 [RemoveRedundancies]: remove_clobbered_writes +2025-08-07T13:56:23Z INFO 49724 [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-08-07T13:56:23Z INFO 49724 [RemoveRedundancies]: remove_useless_insts +2025-08-07T13:56:23Z INFO 49724 [RemoveRedundancies]: remove Useless Instructions: 0 +2025-08-07T13:56:23Z USER 49724 [ModuleForkPass]: remove_redundancies finished after 0.117 seconds +2025-08-07T13:56:23Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1751mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:23Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281309 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:23Z USER 49724 [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:56:23Z INFO 49724 [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=28467 blocks=1 instructions=281309 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:23Z INFO 49724 [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:56:23Z INFO 49724 [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:56:23Z INFO 49724 [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:56:24Z USER 49724 [ModuleForkPass]: anti_dependency_analyzer finished after 1.313 seconds +2025-08-07T13:56:24Z INFO 49724 [ModuleForkPass]: curr_vmrss: 2232mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:24Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281309 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:24Z USER 49724 [ModuleForkPass]: Running tensor_copy_elim +2025-08-07T13:56:24Z INFO 49724 [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=28467 blocks=1 instructions=281309 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:24Z INFO 49724 [TensorCopyElim]: Tensor CP elimination: 0 +2025-08-07T13:56:24Z INFO 49724 [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:56:24Z USER 49724 [ModuleForkPass]: tensor_copy_elim finished after 0.274 seconds +2025-08-07T13:56:24Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1850mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:24Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281309 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:24Z USER 49724 [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-08-07T13:56:24Z INFO 49724 [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=28467 blocks=1 instructions=281309 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:24Z USER 49724 [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.001 seconds +2025-08-07T13:56:24Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1850mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:24Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281309 instruction(s). Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:24Z USER 49724 [ModuleForkPass]: Running post_sched +2025-08-07T13:56:24Z INFO 49724 [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=28467 blocks=1 instructions=281309 Max writers: 1536 Max Readers: 21259 +2025-08-07T13:56:24Z INFO 49724 [post_scheduler]: Start PosT ScheD 3 sunda Thu Aug 7 13:56:24 2025 +2025-08-07T13:56:24Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.336-t41353 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.383-t41364 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.668-t41391 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.715-t41402 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.1000-t41429 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.1047-t41440 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.1332-t41467 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.1379-t41478 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.1664-t41505 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.1711-t41516 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.1996-t41543 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.2043-t41554 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.2328-t41581 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.2375-t41592 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.2660-t41619 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.2707-t41630 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.2992-t41657 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.3039-t41668 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.3324-t41695 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.3371-t41706 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.3656-t41733 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.3703-t41744 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.3988-t41771 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.4035-t41782 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.4320-t41809 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.4367-t41820 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.4652-t41847 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.4699-t41858 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.4984-t41885 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.5031-t41896 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.5316-t41923 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.5363-t41934 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.5648-t41961 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.5695-t41972 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.5980-t41999 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.6027-t42010 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.6312-t42037 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.6359-t42048 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.6644-t42075 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.6691-t42086 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.6976-t42113 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.7023-t42124 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.7308-t42151 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.7355-t42162 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.7640-t42189 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.7687-t42200 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.7972-t42227 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.8019-t42238 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.8304-t42265 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.8351-t42276 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.8636-t42303 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.8683-t42314 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.8968-t42341 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.9015-t42352 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.9300-t42379 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.9347-t42390 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.9632-t42417 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.9679-t42428 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.9964-t42455 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.10011-t42466 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.10296-t42493 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.10343-t42504 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.10628-t42531 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.10675-t42542 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.10960-t42569 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.11007-t42580 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.11292-t42607 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.11339-t42618 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.11624-t42645 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.11671-t42656 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.11956-t42683 +2025-08-07T13:56:25Z WARNING 49724 [post_scheduler]: Inserted memset 0 for _dot.12003-t42694 +2025-08-07T13:56:37Z INFO 49724 [post_scheduler]: Time-aware hwm post-sched +2025-08-07T13:56:43Z INFO 49724 [post_scheduler]: Time-aware simulation time: 35022522 +2025-08-07T13:56:44Z INFO 49724 [post_scheduler]: Done PosT ScheD Thu Aug 7 13:56:44 2025 +2025-08-07T13:56:44Z USER 49724 [ModuleForkPass]: post_sched finished after 19.161 seconds +2025-08-07T13:56:44Z INFO 49724 [ModuleForkPass]: curr_vmrss: 2301mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:44Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281381 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:44Z USER 49724 [ModuleForkPass]: Running expand_scheduling_units +2025-08-07T13:56:44Z INFO 49724 [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=28467 blocks=1 instructions=281381 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:44Z USER 49724 [ModuleForkPass]: expand_scheduling_units finished after 0.052 seconds +2025-08-07T13:56:44Z INFO 49724 [ModuleForkPass]: curr_vmrss: 2184mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:44Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281381 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:44Z USER 49724 [ModuleForkPass]: Running address_rotation_sb +2025-08-07T13:56:44Z INFO 49724 [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=28467 blocks=1 instructions=281381 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:47Z INFO 49724 [DMAOptimizationBase]: PSUM Rotation rotated 6534 PSUM Banks +2025-08-07T13:56:47Z INFO 49724 [DMAOptimizationBase]: PSUM Rotation rotated 7081 PSUM Banks +2025-08-07T13:56:48Z INFO 49724 [DMAOptimizationBase]: PSUM Rotation rotated 345 PSUM Banks +2025-08-07T13:56:48Z INFO 49724 [DMAOptimizationBase]: SB Rotation rotated 506 Sb address +2025-08-07T13:56:49Z INFO 49724 [DMAOptimizationBase]: SB Rotation rotated 4948 Sb address +2025-08-07T13:56:49Z INFO 49724 [DMAOptimizationBase]: SB Rotation rotated 395 Sb address +2025-08-07T13:56:49Z INFO 49724 [DMAOptimizationBase]: SB Rotation rotated 519 Sb address +2025-08-07T13:56:50Z INFO 49724 [DMAOptimizationBase]: SB Rotation rotated 473 Sb address +2025-08-07T13:56:50Z INFO 49724 [DMAOptimizationBase]: moved 0 MM forward +2025-08-07T13:56:50Z INFO 49724 [DMAOptimizationBase]: SB Rotation rotated 6 Sb address +2025-08-07T13:56:50Z INFO 49724 [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-08-07T13:56:50Z USER 49724 [ModuleForkPass]: address_rotation_sb finished after 6.650 seconds +2025-08-07T13:56:50Z INFO 49724 [ModuleForkPass]: curr_vmrss: 2206mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:50Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281381 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:50Z USER 49724 [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:56:50Z INFO 49724 [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=28467 blocks=1 instructions=281381 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:50Z INFO 49724 [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:56:50Z INFO 49724 [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-08-07T13:56:50Z INFO 49724 [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:56:52Z USER 49724 [ModuleForkPass]: anti_dependency_analyzer finished after 1.432 seconds +2025-08-07T13:56:52Z INFO 49724 [ModuleForkPass]: curr_vmrss: 2390mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:52Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281381 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:52Z USER 49724 [ModuleForkPass]: Running anti_dependency_analyzer +2025-08-07T13:56:52Z INFO 49724 [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=28467 blocks=1 instructions=281381 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:52Z INFO 49724 [AntiDependencyAnalyzer]: Batch size: 1000 +2025-08-07T13:56:52Z INFO 49724 [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-08-07T13:56:52Z INFO 49724 [AntiDependencyAnalyzer]: DRAM size: 17179869184 num-bins: 16 bin-size: 1073741824 +2025-08-07T13:56:52Z USER 49724 [ModuleForkPass]: anti_dependency_analyzer finished after 0.266 seconds +2025-08-07T13:56:52Z INFO 49724 [ModuleForkPass]: curr_vmrss: 1996mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:52Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281381 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:52Z USER 49724 [ModuleForkPass]: Running dep_opt +2025-08-07T13:56:52Z INFO 49724 [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=28467 blocks=1 instructions=281381 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:52Z INFO 49724 [build_flow_deps]: Start build fdeps. Invocation: 3Thu Aug 7 13:56:52 2025 +2025-08-07T13:56:52Z INFO 49724 [build_flow_deps]: Allocs: 28467 instructions: 281381 +2025-08-07T13:56:53Z INFO 49724 [build_flow_deps]: Build fdeps inserted 818023 edges +2025-08-07T13:56:53Z INFO 49724 [build_flow_deps]: Done build fdeps 818023 Thu Aug 7 13:56:53 2025 +2025-08-07T13:56:53Z USER 49724 [ModuleForkPass]: dep_opt finished after 1.320 seconds +2025-08-07T13:56:53Z INFO 49724 [ModuleForkPass]: curr_vmrss: 2028mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:53Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281381 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:53Z USER 49724 [ModuleForkPass]: Running report_stats +2025-08-07T13:56:53Z INFO 49724 [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=28467 blocks=1 instructions=281381 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:53Z INFO 49724 [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 1 │ 622329856 │ +│ DMACopy │ Internal │ 1 │ 24576 │ +│ DMACopy │ Internal -> ExternalOutput │ 72 │ 75497472 │ +│ Load │ Const -> Internal │ 78 │ 2395400 │ +│ Load │ ExternalInput -> Internal │ 8053 │ 7644215460 │ +│ Load │ Internal │ 107 │ 2817032 │ +│ Save │ Internal │ 695 │ 2810884 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 4 │ +└─────────────┴────────────────────────────┴───────┴────────────┘ + +2025-08-07T13:56:53Z INFO 49724 [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 72 │ +│ 4 │ 50 │ +│ 8 │ 5 │ +│ 16 │ 3 │ +│ 64 │ 73 │ +│ 256 │ 147 │ +│ 512 │ 666 │ +│ 1024 │ 16 │ +│ 2048 │ 30 │ +│ 4096 │ 2 │ +│ 6144 │ 2304 │ +│ 8192 │ 5565 │ +│ 60768 │ 1 │ +│ 60776 │ 4 │ +│ 262144 │ 72 │ +└─────────────────────┴───────┘ + +2025-08-07T13:56:54Z INFO 49724 [ReportStats]: MM Stats: #MatMults 255192 #MatMult-Transposes 21263 +2025-08-07T13:56:54Z INFO 49724 [ReportStats]: IO Tensor size combined: 8342042648 +2025-08-07T13:56:54Z INFO 49724 [ReportStats]: IO Tensor Statistics: +┌────────────────────┬───────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼───────────────┼──────────┼──────────────┤ +│ input473 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input76 │ ExternalInput │ bfloat16 │ 622329856 │ +│ input85 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input106 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input96 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input84 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input98 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input109 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input107 │ ExternalInput │ bfloat16 │ 50331648 │ +│ input95 │ ExternalInput │ bfloat16 │ 50331648 │ +└────────────────────┴───────────────┴──────────┴──────────────┘ + +2025-08-07T13:56:54Z INFO 49724 [ReportStats]: Large (Internal) Tensor Statistics: +┌──────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├──────────────────────┼──────────┼──────────┼──────────────┤ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ input17_local_39020 │ Internal │ bfloat16 │ 1048576 │ +│ input11_local_38792 │ Internal │ bfloat16 │ 1048576 │ +│ input9_local_38716 │ Internal │ bfloat16 │ 1048576 │ +│ input13_local_38868 │ Internal │ bfloat16 │ 1048576 │ +│ input15_local_38944 │ Internal │ bfloat16 │ 1048576 │ +│ input7_local_38640 │ Internal │ bfloat16 │ 1048576 │ +│ input23_local_39248 │ Internal │ bfloat16 │ 1048576 │ +│ input21_local_39172 │ Internal │ bfloat16 │ 1048576 │ +│ input19_local_39096 │ Internal │ bfloat16 │ 1048576 │ +└──────────────────────┴──────────┴──────────┴──────────────┘ + +2025-08-07T13:56:54Z USER 49724 [ModuleForkPass]: report_stats finished after 0.080 seconds +2025-08-07T13:56:54Z INFO 49724 [ModuleForkPass]: curr_vmrss: 2028mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:54Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281381 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:54Z USER 49724 [BackendPassManager]: mod_parallel_pass finished after 73.221 seconds +2025-08-07T13:56:54Z INFO 49724 [BackendPassManager]: curr_vmrss: 2028mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:54Z INFO 49724 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281381 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:54Z USER 49724 [BackendPassManager]: Running assign_trigger_engine +2025-08-07T13:56:54Z INFO 49724 [BackendPassManager]: Inputs to assign_trigger_engine: modules=1 functions=1 allocs=28467 blocks=1 instructions=281381 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:54Z INFO 49724 [AssignTriggerEngine]: Assigned trigger engine for 771 DMA instructions. Moved 76 DMA instructions to CC's engines. +2025-08-07T13:56:54Z USER 49724 [BackendPassManager]: assign_trigger_engine finished after 0.133 seconds +2025-08-07T13:56:54Z INFO 49724 [BackendPassManager]: curr_vmrss: 2028mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:54Z INFO 49724 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281381 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:54Z USER 49724 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:56:54Z INFO 49724 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=1 allocs=28467 blocks=1 instructions=281381 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:54Z USER 49724 [SubgraphForkPass]: Running lower_local_collectives +2025-08-07T13:56:54Z INFO 49724 [SubgraphForkPass]: Inputs to lower_local_collectives: modules=1 functions=1 allocs=28467 blocks=1 instructions=281381 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:54Z USER 49724 [SubgraphForkPass]: lower_local_collectives finished after 0.002 seconds +2025-08-07T13:56:54Z INFO 49724 [SubgraphForkPass]: curr_vmrss: 2028mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:54Z INFO 49724 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281381 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:54Z USER 49724 [SubgraphForkPass]: Running extend_shared_lifetimes +2025-08-07T13:56:54Z INFO 49724 [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=1 functions=1 allocs=28467 blocks=1 instructions=281381 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:54Z USER 49724 [SubgraphForkPass]: extend_shared_lifetimes finished after 0.002 seconds +2025-08-07T13:56:54Z INFO 49724 [SubgraphForkPass]: curr_vmrss: 2028mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:54Z INFO 49724 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281381 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:54Z USER 49724 [SubgraphForkPass]: Running dead_code_elim +2025-08-07T13:56:54Z INFO 49724 [SubgraphForkPass]: Inputs to dead_code_elim: modules=1 functions=1 allocs=28467 blocks=1 instructions=281381 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:54Z INFO 49724 [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-08-07T13:56:54Z USER 49724 [SubgraphForkPass]: dead_code_elim finished after 0.215 seconds +2025-08-07T13:56:54Z INFO 49724 [SubgraphForkPass]: curr_vmrss: 2028mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:54Z INFO 49724 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281381 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:54Z USER 49724 [BackendPassManager]: subgraph_parallel_pass finished after 0.225 seconds +2025-08-07T13:56:54Z INFO 49724 [BackendPassManager]: curr_vmrss: 2028mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:54Z INFO 49724 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281381 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:54Z USER 49724 [BackendPassManager]: Running assign_hwdge_engine +2025-08-07T13:56:54Z INFO 49724 [BackendPassManager]: Inputs to assign_hwdge_engine: modules=1 functions=1 allocs=28467 blocks=1 instructions=281381 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:54Z USER 49724 [BackendPassManager]: assign_hwdge_engine finished after 0.034 seconds +2025-08-07T13:56:54Z INFO 49724 [BackendPassManager]: curr_vmrss: 2028mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:54Z INFO 49724 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281381 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:54Z USER 49724 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:56:54Z INFO 49724 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=28467 blocks=1 instructions=281381 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:54Z USER 49724 [ModuleForkPass]: Running alloc_queues +2025-08-07T13:56:54Z INFO 49724 [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=28467 blocks=1 instructions=281381 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:54Z INFO 49724 [AllocQueues]: DMACopy transpose will be triggered from multiple engines +2025-08-07T13:56:54Z INFO 49724 [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 42 │ +│ qPoolIO0 │ input │ Pool │ 16 │ 1 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 110 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 95 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 671 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 5 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 8084 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-08-07T13:56:54Z USER 49724 [ModuleForkPass]: alloc_queues finished after 0.035 seconds +2025-08-07T13:56:54Z INFO 49724 [ModuleForkPass]: curr_vmrss: 2028mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:54Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281381 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:54Z USER 49724 [ModuleForkPass]: Running chain_dma_transposes +2025-08-07T13:56:54Z INFO 49724 [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=28467 blocks=1 instructions=281381 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:54Z USER 49724 [ModuleForkPass]: chain_dma_transposes finished after 0.001 seconds +2025-08-07T13:56:54Z INFO 49724 [ModuleForkPass]: curr_vmrss: 2028mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:54Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281381 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:54Z USER 49724 [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-08-07T13:56:54Z INFO 49724 [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=28467 blocks=1 instructions=281381 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:54Z USER 49724 [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.001 seconds +2025-08-07T13:56:54Z INFO 49724 [ModuleForkPass]: curr_vmrss: 2028mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:54Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281381 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:54Z USER 49724 [ModuleForkPass]: Running lower_control +2025-08-07T13:56:54Z INFO 49724 [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=28467 blocks=1 instructions=281381 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:54Z INFO 49724 [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-08-07T13:56:54Z USER 49724 [ModuleForkPass]: lower_control finished after 0.414 seconds +2025-08-07T13:56:54Z INFO 49724 [ModuleForkPass]: curr_vmrss: 2028mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:54Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281381 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:54Z USER 49724 [BackendPassManager]: mod_parallel_pass finished after 0.461 seconds +2025-08-07T13:56:54Z INFO 49724 [BackendPassManager]: curr_vmrss: 2028mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:54Z INFO 49724 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281381 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:54Z USER 49724 [BackendPassManager]: Running nc_parallel_pass +2025-08-07T13:56:54Z INFO 49724 [BackendPassManager]: Inputs to nc_parallel_pass: modules=1 functions=1 allocs=28467 blocks=1 instructions=281381 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:54Z USER 49724 [CoreForkPass]: Running dep_reduction +2025-08-07T13:56:54Z INFO 49724 [CoreForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=28467 blocks=1 instructions=281381 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:54Z INFO 49724 [DepReduction]: Start Dependency Reduction +2025-08-07T13:56:55Z INFO 49724 [DepReduction]: Processing async instrs... +2025-08-07T13:56:55Z INFO 49724 [DepReduction]: Processing secondary edges per engine... +2025-08-07T13:56:55Z INFO 49724 [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 255142 +2025-08-07T13:56:55Z INFO 49724 [DepReduction]: Processing redundant descendants, Done. Num edges removed 264174 +2025-08-07T13:56:55Z INFO 49724 [DepReduction]: Processing async instrs, Done. Num edges removed 264174 +2025-08-07T13:56:58Z INFO 49724 [DepReduction]: Num Async removed: 0 +2025-08-07T13:56:58Z INFO 49724 [DepReduction]: Finished dependency reduction: 1778145 removed, new total 39392 +2025-08-07T13:56:58Z INFO 49724 [DepReduction]: Finished Dependency Reduction +2025-08-07T13:56:58Z USER 49724 [CoreForkPass]: dep_reduction finished after 3.595 seconds +2025-08-07T13:56:58Z INFO 49724 [CoreForkPass]: curr_vmrss: 2254mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:58Z INFO 49724 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281381 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:58Z USER 49724 [CoreForkPass]: Running lower_dynamic_dma +2025-08-07T13:56:58Z INFO 49724 [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=1 allocs=28467 blocks=1 instructions=281381 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:58Z USER 49724 [CoreForkPass]: lower_dynamic_dma finished after 0.068 seconds +2025-08-07T13:56:58Z INFO 49724 [CoreForkPass]: curr_vmrss: 2247mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:58Z INFO 49724 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281381 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:58Z USER 49724 [CoreForkPass]: Running legalize_dynamic_dma +2025-08-07T13:56:58Z INFO 49724 [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=1 allocs=28467 blocks=1 instructions=281381 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:58Z INFO 49724 [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 1 DGE instructions +2025-08-07T13:56:58Z INFO 49724 [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 1 DGE instructions were scanned +2025-08-07T13:56:58Z INFO 49724 [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-08-07T13:56:58Z USER 49724 [CoreForkPass]: legalize_dynamic_dma finished after 0.129 seconds +2025-08-07T13:56:58Z INFO 49724 [CoreForkPass]: curr_vmrss: 2247mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:58Z INFO 49724 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281381 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:58Z USER 49724 [CoreForkPass]: Running lower_dma +2025-08-07T13:56:58Z INFO 49724 [CoreForkPass]: Inputs to lower_dma: modules=1 functions=1 allocs=28467 blocks=1 instructions=281381 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:58Z INFO 49724 [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 7938/7938 (100% DGE) + power-of-2 partition : 7939/7982 (99.4613% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 7939/7982 (99.4613% DGE) + Cast (DGE/DMA) + 128 partition : 72/72 (100% DGE) + power-of-2 partition : 72/72 (100% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 72/72 (100% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 0/9 (0% DGE) + power-of-2 partition : 0/880 (0% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/880 (0% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/0 + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/0 + CopyMode + CCE : 1 + Transpose : 0 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 1/1 (100% DGE) + vector : 72/72 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-08-07T13:56:58Z USER 49724 [CoreForkPass]: lower_dma finished after 0.236 seconds +2025-08-07T13:56:58Z INFO 49724 [CoreForkPass]: curr_vmrss: 2247mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:58Z INFO 49724 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281383 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:58Z USER 49724 [CoreForkPass]: Running coalesce_dma_blocks +2025-08-07T13:56:58Z INFO 49724 [CoreForkPass]: Inputs to coalesce_dma_blocks: modules=1 functions=1 allocs=28467 blocks=1 instructions=281383 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:59Z INFO 49724 [CoalesceDmaBlocks]: Coaleseced 53 DMA triggers +2025-08-07T13:56:59Z USER 49724 [CoreForkPass]: coalesce_dma_blocks finished after 0.128 seconds +2025-08-07T13:56:59Z INFO 49724 [CoreForkPass]: curr_vmrss: 2251mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:59Z INFO 49724 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281330 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:59Z USER 49724 [CoreForkPass]: Running expand_all_engine +2025-08-07T13:56:59Z INFO 49724 [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=1 allocs=28467 blocks=1 instructions=281330 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:59Z USER 49724 [CoreForkPass]: expand_all_engine finished after 0.048 seconds +2025-08-07T13:56:59Z INFO 49724 [CoreForkPass]: curr_vmrss: 2246mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:59Z INFO 49724 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281330 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:59Z USER 49724 [CoreForkPass]: Running alloc_semaphores +2025-08-07T13:56:59Z INFO 49724 [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=1 allocs=28467 blocks=1 instructions=281330 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:59Z USER 49724 [CoreForkPass]: alloc_semaphores finished after 0.392 seconds +2025-08-07T13:56:59Z INFO 49724 [CoreForkPass]: curr_vmrss: 2246mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:59Z INFO 49724 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281330 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:59Z USER 49724 [CoreForkPass]: Running expand_inst_late +2025-08-07T13:56:59Z INFO 49724 [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=1 allocs=28467 blocks=1 instructions=281330 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:59Z USER 49724 [CoreForkPass]: expand_inst_late finished after 0.446 seconds +2025-08-07T13:56:59Z INFO 49724 [CoreForkPass]: curr_vmrss: 2246mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:56:59Z INFO 49724 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281405 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:56:59Z USER 49724 [CoreForkPass]: Running seq_inst_opt +2025-08-07T13:56:59Z INFO 49724 [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=1 allocs=28467 blocks=1 instructions=281405 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:57:00Z INFO 49724 [SeqInstOpt]: Removing 71 unnecessary InstRegisterMove instruction(s) from Block1 +2025-08-07T13:57:00Z USER 49724 [CoreForkPass]: seq_inst_opt finished after 0.036 seconds +2025-08-07T13:57:00Z INFO 49724 [CoreForkPass]: curr_vmrss: 2246mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:57:00Z INFO 49724 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 281334 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:57:00Z USER 49724 [CoreForkPass]: Running lower_sync +2025-08-07T13:57:00Z INFO 49724 [CoreForkPass]: Inputs to lower_sync: modules=1 functions=1 allocs=28467 blocks=1 instructions=281334 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:57:00Z USER 49724 [CoreForkPass]: lower_sync finished after 0.101 seconds +2025-08-07T13:57:00Z INFO 49724 [CoreForkPass]: curr_vmrss: 2253mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:57:00Z INFO 49724 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 290050 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:57:00Z USER 49724 [CoreForkPass]: Running lower_act +2025-08-07T13:57:00Z INFO 49724 [CoreForkPass]: Inputs to lower_act: modules=1 functions=1 allocs=28467 blocks=1 instructions=290050 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:57:00Z USER 49724 [CoreForkPass]: lower_act finished after 0.059 seconds +2025-08-07T13:57:00Z INFO 49724 [CoreForkPass]: curr_vmrss: 2254mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:57:00Z INFO 49724 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 290308 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:57:00Z USER 49724 [CoreForkPass]: Running lower_dve +2025-08-07T13:57:00Z INFO 49724 [CoreForkPass]: Inputs to lower_dve: modules=1 functions=1 allocs=28467 blocks=1 instructions=290308 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:57:00Z INFO 49724 [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen2/dve_info.json +2025-08-07T13:57:00Z USER 49724 [CoreForkPass]: lower_dve finished after 0.373 seconds +2025-08-07T13:57:00Z INFO 49724 [CoreForkPass]: curr_vmrss: 2298mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:57:00Z INFO 49724 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 290308 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:57:00Z USER 49724 [CoreForkPass]: Running lower_ap +2025-08-07T13:57:00Z INFO 49724 [CoreForkPass]: Inputs to lower_ap: modules=1 functions=1 allocs=28467 blocks=1 instructions=290308 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:57:00Z USER 49724 [CoreForkPass]: lower_ap finished after 0.056 seconds +2025-08-07T13:57:00Z INFO 49724 [CoreForkPass]: curr_vmrss: 2255mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:57:00Z INFO 49724 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 290308 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:57:00Z USER 49724 [CoreForkPass]: Running coloring_allocator_reg +2025-08-07T13:57:00Z INFO 49724 [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=1 allocs=28467 blocks=1 instructions=290308 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:57:00Z INFO 49724 [ColoringAllocator::Rep]: Allocating functions +2025-08-07T13:57:00Z INFO 49724 [ColoringAllocator::Rep]: linearize and check +2025-08-07T13:57:00Z INFO 49724 [REG_Allocator]: allocating REG +2025-08-07T13:57:00Z INFO 49724 [REG_Allocator]: main loop iteration 1 +2025-08-07T13:57:00Z INFO 49724 [REG_Allocator]: renumber registers +2025-08-07T13:57:00Z INFO 49724 [REG_Allocator]: size = 5 +2025-08-07T13:57:00Z INFO 49724 []: find first defs for local reg +2025-08-07T13:57:00Z INFO 49724 []: find first defs for global reg +2025-08-07T13:57:00Z INFO 49724 [REG_Allocator]: live range analysis +2025-08-07T13:57:01Z INFO 49724 [REG_Allocator]: find costs +2025-08-07T13:57:01Z INFO 49724 [REG_Allocator]: simplify interference graph +2025-08-07T13:57:01Z INFO 49724 [REG_Allocator]: initialize low and high +2025-08-07T13:57:01Z INFO 49724 [REG_Allocator]: lo = 5 +2025-08-07T13:57:01Z INFO 49724 [REG_Allocator]: hi = 0 +2025-08-07T13:57:01Z INFO 49724 [REG_Allocator]: inf = 0 +2025-08-07T13:57:01Z INFO 49724 [REG_Allocator]: total = 5 +2025-08-07T13:57:01Z INFO 49724 [REG_Allocator]: simplify +2025-08-07T13:57:01Z INFO 49724 [REG_Allocator]: new candidates = 0 +2025-08-07T13:57:01Z INFO 49724 [REG_Allocator]: select ranges +2025-08-07T13:57:01Z INFO 49724 [REG_Allocator]: no more spills +2025-08-07T13:57:01Z INFO 49724 [REG_Allocator]: REG score = 0 (lower is better) +2025-08-07T13:57:01Z INFO 49724 [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-08-07T13:57:01Z INFO 49724 [REG_Allocator]: 0% REG utilization after allocation +2025-08-07T13:57:01Z USER 49724 [CoreForkPass]: coloring_allocator_reg finished after 0.486 seconds +2025-08-07T13:57:01Z INFO 49724 [CoreForkPass]: curr_vmrss: 2301mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:57:01Z INFO 49724 [CoreForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 290308 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:57:01Z USER 49724 [BackendPassManager]: nc_parallel_pass finished after 6.576 seconds +2025-08-07T13:57:01Z INFO 49724 [BackendPassManager]: curr_vmrss: 2255mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:57:01Z INFO 49724 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 290308 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:57:01Z USER 49724 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:57:01Z INFO 49724 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=28467 blocks=1 instructions=290308 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:57:01Z USER 49724 [ModuleForkPass]: Running birverifier +2025-08-07T13:57:01Z INFO 49724 [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=28467 blocks=1 instructions=290308 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:57:01Z USER 49724 [ModuleForkPass]: birverifier finished after 0.327 seconds +2025-08-07T13:57:01Z INFO 49724 [ModuleForkPass]: curr_vmrss: 2002mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:57:01Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 290308 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:57:01Z USER 49724 [BackendPassManager]: mod_parallel_pass finished after 0.333 seconds +2025-08-07T13:57:01Z INFO 49724 [BackendPassManager]: curr_vmrss: 2002mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:57:01Z INFO 49724 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 290308 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:57:01Z USER 49724 [BackendPassManager]: Running subgraph_parallel_pass +2025-08-07T13:57:01Z INFO 49724 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=1 functions=1 allocs=28467 blocks=1 instructions=290308 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:57:01Z USER 49724 [SubgraphForkPass]: Running lnc_verifier +2025-08-07T13:57:01Z INFO 49724 [SubgraphForkPass]: Inputs to lnc_verifier: modules=1 functions=1 allocs=28467 blocks=1 instructions=290308 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:57:01Z USER 49724 [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-08-07T13:57:01Z INFO 49724 [SubgraphForkPass]: curr_vmrss: 2002mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:57:01Z INFO 49724 [SubgraphForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 290308 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:57:01Z USER 49724 [BackendPassManager]: subgraph_parallel_pass finished after 0.004 seconds +2025-08-07T13:57:01Z INFO 49724 [BackendPassManager]: curr_vmrss: 2002mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:57:01Z INFO 49724 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 290308 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:57:01Z USER 49724 [BackendPassManager]: Running mod_parallel_pass +2025-08-07T13:57:01Z INFO 49724 [BackendPassManager]: Inputs to mod_parallel_pass: modules=1 functions=1 allocs=28467 blocks=1 instructions=290308 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:57:01Z USER 49724 [ModuleForkPass]: Running codegen +2025-08-07T13:57:01Z INFO 49724 [ModuleForkPass]: Inputs to codegen: modules=1 functions=1 allocs=28467 blocks=1 instructions=290308 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:57:01Z INFO 49724 [Codegen]: Total compiler allocated DRAM tensors: 0.00290298 GB +2025-08-07T13:57:01Z INFO 49724 [Codegen]: Total un-allocated DRAM tensors by kind: +2025-08-07T13:57:01Z INFO 49724 [Codegen]: +┌────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼─────────────┤ +│ ExternalInput │ 7.69882 │ +│ ExternalOutput │ 3.72529e-09 │ +│ Const │ 0.0022452 │ +└────────────────┴─────────────┘ + +2025-08-07T13:57:01Z INFO 49724 [Codegen]: Total runtime managed DRAM tensors: 7.70107 GB +2025-08-07T13:57:03Z INFO 49724 [Codegen]: Instruction Stats: +2025-08-07T13:57:03Z INFO 49724 [Codegen]: +┌─────────────────────┬────────┐ +│ Opcode │ Count │ +├─────────────────────┼────────┤ +│ MATMUL │ 255192 │ +│ LDWEIGHTS │ 254940 │ +│ ACTIVATE │ 12781 │ +│ EVENT_SEMAPHORE │ 8716 │ +│ UNKNOWN(0xd4) │ 8084 │ +│ TENSOR_TENSOR │ 1341 │ +│ PSEUDO_DMA_TRIGGER │ 871 │ +│ LOAD_MASK_SELECT │ 546 │ +│ STREAM_SHUFFLE │ 510 │ +│ MATCH_VALUE_LOAD │ 441 │ +│ MEMSET │ 370 │ +│ TENSOR_SCALAR_ADDR │ 345 │ +│ TENSOR_SCALAR │ 332 │ +│ ACT_TABLE_LOAD │ 258 │ +│ CAST │ 241 │ +│ FIND_INDEX8 │ 224 │ +│ MAX8 │ 224 │ +│ MATCH_REPLACE8 │ 217 │ +│ UNKNOWN(0xda) │ 148 │ +│ TENSOR_REDUCE │ 115 │ +│ GATHER │ 99 │ +│ POOL_BUFFER_LOAD │ 99 │ +│ UNKNOWN(0xd9) │ 75 │ +│ RECIPROCAL │ 75 │ +│ IOTA │ 73 │ +│ COPY │ 73 │ +│ UNKNOWN(0x8d) │ 72 │ +│ UNKNOWN(0xe8) │ 38 │ +│ STREAM_TRANSPOSE │ 36 │ +│ PSEUDO_BRANCH_LABEL │ 5 │ +│ ALU_OP │ 2 │ +│ UNKNOWN(0xe5) │ 2 │ +│ MOVE │ 1 │ +│ PSEUDO_TENSOR_LOAD │ 1 │ +│ NOP │ 1 │ +│ RNG │ 1 │ +│ TENSOR_SCALAR │ 1 │ +└─────────────────────┴────────┘ + +2025-08-07T13:57:03Z INFO 49724 [Codegen]: +┌────────────┬────────┐ +│ Engine │ Count │ +├────────────┼────────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 13236 │ +│ Scalar │ 14676 │ +│ Tensor │ 513299 │ +│ SyncDMA │ 0 │ +│ Vector │ 5185 │ +│ Sync │ 159 │ +│ All │ 0 │ +└────────────┴────────┘ + +2025-08-07T13:57:03Z INFO 49724 [Codegen]: Total instructions: 546555 (0.0325772 GB) +2025-08-07T13:57:03Z INFO 49724 [Codegen]: Total DynamicDMA instruction count: 8084 +2025-08-07T13:57:03Z USER 49724 [Codegen]: isa_gen finished after 1.380 seconds +2025-08-07T13:57:03Z INFO 49724 [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────┼────────────────┤ +│ qActSpillReload0 │ 5932 │ +│ qDVESpillReload0 │ 264 │ +│ qPoolIO0 │ 2 │ +│ qPoolSpillReload0 │ 7308 │ +│ qSPIO0 │ 98 │ +│ qSPSpillReload0 │ 12766 │ +└───────────────────┴────────────────┘ + +Total descriptors: 26370 (0.000392944 GB) +2025-08-07T13:57:03Z INFO 49724 [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qSPIO0 │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qPoolDynamic │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +│ qPoolIO0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 112 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-08-07T13:57:03Z INFO 49724 [Codegen]: Tensors with largest descriptor count: +┌─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬───────────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┼───────────────┼──────────┼──────────────────┤ +│ Coalesced_memloc_Coalesced_memloc_cosine.140.56214--cosine.140.56212_42--Coalesced_memloc_cosine.140.56204--cosine.140.56202_45_108 │ Internal │ float32 │ 5 │ +│ Coalesced_memloc_Coalesced_memloc_cosine.140.56274--cosine.140.56272_24--Coalesced_memloc_cosine.140.56264--cosine.140.56262_27_99 │ Internal │ float32 │ 5 │ +│ Coalesced_memloc_Coalesced_memloc_cosine.140.56334--cosine.140.56332_6--Coalesced_memloc_cosine.140.56324--cosine.140.56322_9_90 │ Internal │ float32 │ 5 │ +│ Coalesced_memloc_Coalesced_memloc_cosine.140.56234--cosine.140.56232_36--Coalesced_memloc_cosine.140.56224--cosine.140.56222_39_105 │ Internal │ float32 │ 5 │ +│ Coalesced_memloc_Coalesced_memloc_cosine.140.56354--cosine.140.56352_0--Coalesced_memloc_cosine.140.56344--cosine.140.56342_3_87 │ Internal │ float32 │ 5 │ +│ Coalesced_memloc_Coalesced_memloc_cosine.140.56194--cosine.140.56192_48--Coalesced_memloc_cosine.140.56184--cosine.140.56182_51_111 │ Internal │ float32 │ 5 │ +│ Coalesced_memloc_Coalesced_memloc_cosine.140.56154--cosine.140.56152_60--Coalesced_memloc_cosine.140.56144--cosine.140.56142_63_117 │ Internal │ float32 │ 5 │ +│ Coalesced_memloc_Coalesced_memloc_cosine.140.56294--cosine.140.56292_18--Coalesced_memloc_cosine.140.56284--cosine.140.56282_21_96 │ Internal │ float32 │ 5 │ +│ input2 │ ExternalInput │ int32 │ 37 │ +│ convert.840 │ Internal │ float32 │ 599 │ +└─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴───────────────┴──────────┴──────────────────┘ + +2025-08-07T13:57:03Z USER 49724 [Codegen]: dma_desc_gen finished after 0.013 seconds +2025-08-07T13:57:03Z INFO 49724 [Codegen]: Estimated peak DRAM usage: 7.73694 GB +2025-08-07T13:57:03Z INFO 49724 [Codegen]: Generating debug info +2025-08-07T13:57:03Z WARNING 49724 [Codegen]: Found 163 instructions with more than 100 dependencies. For each such instruction, skipping writing more than 100 dependencies into the built-in NEFF debug info to prevent excessive compile time and NEFF size. For those instructions, the Neuron profiler will not display the skipped dependencies. +2025-08-07T13:57:03Z USER 49724 [Codegen]: debug_info_gen finished after 0.617 seconds +2025-08-07T13:57:03Z USER 49724 [ModuleForkPass]: codegen finished after 2.063 seconds +2025-08-07T13:57:03Z INFO 49724 [ModuleForkPass]: curr_vmrss: 2235mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:57:03Z INFO 49724 [ModuleForkPass]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 290308 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:57:03Z USER 49724 [BackendPassManager]: mod_parallel_pass finished after 2.089 seconds +2025-08-07T13:57:03Z INFO 49724 [BackendPassManager]: curr_vmrss: 2042mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:57:03Z INFO 49724 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 290308 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:57:03Z USER 49724 [BackendPassManager]: Running neff_packager +2025-08-07T13:57:03Z INFO 49724 [BackendPassManager]: Inputs to neff_packager: modules=1 functions=1 allocs=28467 blocks=1 instructions=290308 Max writers: 1537 Max Readers: 21259 +2025-08-07T13:57:04Z WARNING 49724 [NeffFileWriter]: writeKelp missing file /local/p4clients/pkgbuild-const/workspace/build/KaenaCompiler/KaenaCompiler-2.x.169490.0/AL2_x86_64/DEV.STD.PTHREAD/build/private/_skbuild/linux-x86_64-3.10/cmake-build/neuronxcc/walrus/neff_packager/MetricMetadata.json +2025-08-07T13:57:04Z INFO 49724 [NeffFileWriter]: Neff will be written to: /home/ubuntu/qwen3/token_generation_model/_tp0_bk3/model.MODULE_d3ed4857bd8baeff8023+b05cff0a.neff +2025-08-07T13:57:04Z INFO 49724 [NeffFileWriter]: IR signature: 6aecd6c6c01d7580a238f045b22882dc for neff artifacts +2025-08-07T13:57:04Z USER 49724 [BackendPassManager]: neff_packager finished after 0.318 seconds +2025-08-07T13:57:04Z INFO 49724 [BackendPassManager]: curr_vmrss: 2042mb, ru_maxrss: 2417mb (delta=0mb) +2025-08-07T13:57:04Z INFO 49724 [BackendPassManager]: Output has 1 module(s), 1 function(s), 28467 memory location(s), 1 block(s), and 290308 instruction(s). Max writers: 1537 Max Readers: 21259 +2025-08-07T13:57:04Z INFO 49724 [BackendDriver]: HBM scratchpad usage summary (post-allocation): +┌──────┬───────────┬────────────────────────────────────────────────────────────┬─────────────┐ +│ Core │ Subgraph │ Description │ Value │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc00 │ module │ Peak scratchpad usage: local │ 0.002903 GB │ +│ nc00 │ module │ Total size of allocated tensors: local │ 0.003231 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local │ 0.002903 GB │ +│ nc00 │ Post-link │ Peak scratchpad usage after intermediate tensor allocation │ 0.000000 GB │ +│ nc00 │ Post-link │ Total size of allocated intermediate tensors │ 0.000000 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ Max │ Max │ Peak scratchpad usage │ 0.002903 GB │ +│ Max │ Max │ Peak scratchpad usage (page-aligned) │ 0.500000 GB │ +└──────┴───────────┴────────────────────────────────────────────────────────────┴─────────────┘ + +2025-08-07T13:57:04Z INFO 49724 [BackendDriver]: Backend completed successfully, tearing down. +2025-08-07T13:57:05Z INFO 48501 [job.WalrusDriver.0]: Job #0 finished +2025-08-07T13:57:05Z INFO 48501 [pipeline.Pipeline.0]: Finished job job.WalrusDriver.0 +2025-08-07T13:57:05Z INFO 48501 [pipeline.Pipeline.0]: Starting job job.BIRLinker.0 +2025-08-07T13:57:05Z INFO 48501 [job.BIRLinker.0]: Replay this job by calling: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/bin/neuronx-cc compile --framework XLA --state '{"model": ["/home/ubuntu/qwen3/token_generation_model/_tp0_bk3/model.MODULE_d3ed4857bd8baeff8023+b05cff0a.hlo_module.pb"], "tensormap": "tensor_map.json", "bir": "bir.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "state_dir": "/home/ubuntu/qwen3/token_generation_model/_tp0_bk3/neuronxcc-m7dyulmn/sg00", "state_id": "sg00"}' --pipeline BIRLinker +2025-08-07T13:57:05Z INFO 48501 [job.BIRLinker.0]: BIRLinker cwd: /home/ubuntu/qwen3/token_generation_model/_tp0_bk3/neuronxcc-m7dyulmn +2025-08-07T13:57:05Z INFO 48501 [job.BIRLinker.0]: Linking not needed. Netlist doesnt exist +2025-08-07T13:57:05Z INFO 48501 [pipeline.Pipeline.0]: Finished job job.BIRLinker.0 +2025-08-07T13:57:05Z INFO 48501 [pipeline.Pipeline.0]: Starting job job.Kelper.0 +2025-08-07T13:57:05Z INFO 48501 [job.Kelper.0]: Skipping neff generation which was already performed by neff_packager +2025-08-07T13:57:05Z INFO 48501 [pipeline.Pipeline.0]: Finished job job.Kelper.0 +2025-08-07T13:57:05Z INFO 48501 [pipeline.Pipeline.0]: Starting job job.NeffWrapper.0 +2025-08-07T13:57:05Z INFO 48501 [job.NeffWrapper.0]: Job NeffWrapper len(in_states) 1 +2025-08-07T13:57:05Z INFO 48501 [job.NeffWrapper.0]: Processing input #0 +2025-08-07T13:57:05Z INFO 48501 [job.NeffWrapper.0]: Start NeffWrapper +2025-08-07T13:57:05Z INFO 48501 [job.NeffWrapper.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_7_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo-neff-wrapper --hlo /home/ubuntu/qwen3/token_generation_model/_tp0_bk3/model.MODULE_d3ed4857bd8baeff8023+b05cff0a.hlo_module.pb --neff /home/ubuntu/qwen3/token_generation_model/_tp0_bk3/model.MODULE_d3ed4857bd8baeff8023+b05cff0a.neff --io_transposes /home/ubuntu/qwen3/token_generation_model/_tp0_bk3/neuronxcc-m7dyulmn/io_transposes.json --output /home/ubuntu/qwen3/token_generation_model/_tp0_bk3/wrapped_neff.hlo --netlist /home/ubuntu/qwen3/token_generation_model/_tp0_bk3/neuronxcc-m7dyulmn/hlo_netlist.json +2025-08-07T13:57:05Z INFO 48501 [job.NeffWrapper.0]: Could not open file: /home/ubuntu/qwen3/token_generation_model/_tp0_bk3/neuronxcc-m7dyulmn/hlo_netlist.json +There are no io transposes nor zero-sized parameters. Output will not be produced. +Hlo neff wrapper finished successfully. Have a wonderful day :D + +2025-08-07T13:57:05Z INFO 48501 [job.NeffWrapper.0]: Job #0 finished +2025-08-07T13:57:05Z INFO 48501 [pipeline.Pipeline.0]: Finished job job.NeffWrapper.0 +2025-08-07T13:57:05Z INFO 48501 [pipeline.Pipeline.0]: Finished pipeline Pipeline +2025-08-07T13:57:05Z INFO 48501 [pipeline.Pipeline.0]: Job #0 finished +2025-08-07T13:57:05Z INFO 47986 [root]: Subcommand returned with exitcode=0 diff --git a/token_generation_model/_tp0_bk3/metaneff.pb b/token_generation_model/_tp0_bk3/metaneff.pb new file mode 100644 index 0000000000000000000000000000000000000000..ba941ccbbee873fa667b69aa6087b958f441e66e --- /dev/null +++ b/token_generation_model/_tp0_bk3/metaneff.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3116fcf330e4579212515d0cf68d593af7a8d4b5d8d6d9d1b21b201317c2fcee +size 984551 diff --git a/token_generation_model/_tp0_bk3/model.MODULE_d3ed4857bd8baeff8023+b05cff0a.hlo_module.pb b/token_generation_model/_tp0_bk3/model.MODULE_d3ed4857bd8baeff8023+b05cff0a.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..4597072a950f422e6c9fbfbab6971f95b419aed2 --- /dev/null +++ b/token_generation_model/_tp0_bk3/model.MODULE_d3ed4857bd8baeff8023+b05cff0a.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:632099279834ad49336b20fe638b015f3bd9f3d5379c77ef3a7fbaef8cea450e +size 1063359 diff --git a/token_generation_model/_tp0_bk3/model.MODULE_d3ed4857bd8baeff8023+b05cff0a.neff b/token_generation_model/_tp0_bk3/model.MODULE_d3ed4857bd8baeff8023+b05cff0a.neff new file mode 100644 index 0000000000000000000000000000000000000000..5c280045ee159689e7f6140a94f0737a97c3900e --- /dev/null +++ b/token_generation_model/_tp0_bk3/model.MODULE_d3ed4857bd8baeff8023+b05cff0a.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:487e4feb4f697b81662129763fb99bc19ea4fd42678a67a9e895c701ddb22265 +size 6124544 diff --git a/token_generation_model/_tp0_bk3/neuron_config.json b/token_generation_model/_tp0_bk3/neuron_config.json new file mode 100644 index 0000000000000000000000000000000000000000..56af61cf005a7246799b4e219f0aaec8aa76f84e --- /dev/null +++ b/token_generation_model/_tp0_bk3/neuron_config.json @@ -0,0 +1,220 @@ +{ + "_attn_implementation_autoset": false, + "_name_or_path": "Qwen/Qwen3-8B", + "add_cross_attention": false, + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "attribute_map": {}, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 151643, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 151645, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "fused_spec_config": null, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 12288, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 40960, + "max_window_layers": 36, + "metadata": null, + "min_length": 0, + "model_type": "qwen3", + "neuron_config": { + "activation_quantization_type": null, + "allow_input_truncation": false, + "apply_seq_ids_mask": false, + "async_mode": false, + "attention_dp_degree": 1, + "attention_dtype": null, + "attn_block_cte_nki_kernel_enabled": false, + "attn_block_tkg_nki_kernel_cache_update": false, + "attn_block_tkg_nki_kernel_enabled": false, + "attn_cls": { + "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3", + "__name__": "NeuronQwen3Attention" + }, + "attn_kernel_enabled": null, + "attn_tkg_builtin_kernel_enabled": false, + "attn_tkg_nki_kernel_enabled": false, + "batch_size": 1, + "bucket_n_active_tokens": false, + "buckets": [ + 1024 + ], + "cast_type": "config", + "cc_pipeline_tiling_factor": 1, + "chunked_prefill_config": null, + "context_encoding_buckets": null, + "cp_degree": 1, + "ctx_batch_size": 1, + "disable_kv_cache_tiling": false, + "draft_model_modules_to_not_convert": null, + "enable_bucketing": true, + "enable_eagle_draft_input_norm": false, + "enable_eagle_speculation": false, + "enable_fused_speculation": false, + "enable_long_context_mode": false, + "enable_output_completion_notifications": false, + "enable_spill_reload_dge": false, + "enable_token_tree": false, + "ep_degree": 1, + "expert_mlp_nki_kernel_enabled": null, + "flash_decoding_enabled": false, + "fused_qkv": false, + "fused_rmsnorm_skip_gamma": false, + "is_block_kv_layout": null, + "is_chunked_prefill": false, + "is_continuous_batching": true, + "is_eagle_draft": false, + "is_medusa": false, + "is_prefill_stage": false, + "is_prefix_caching": false, + "k_cache_transposed": false, + "kv_cache_batch_size": 1, + "kv_cache_padding_size": 0, + "kv_cache_quant": false, + "kv_cache_tiling": false, + "layer_boundary_markers": false, + "lm_head_pad": false, + "lm_head_pad_alignment_size": 1, + "local_ranks_size": 2, + "logical_nc_config": 1, + "lora_config": null, + "max_batch_size": 1, + "max_context_length": 1024, + "max_length": 1024, + "max_new_tokens": null, + "medusa_speculation_length": 0, + "medusa_tree": null, + "mlp_kernel_enabled": false, + "mlp_kernel_fuse_residual_add": false, + "modules_to_not_convert": null, + "moe_fused_nki_kernel_enabled": null, + "n_active_tokens": 1, + "n_positions": 1024, + "num_medusa_heads": 0, + "on_cpu": false, + "on_device_sampling_config": { + "deterministic": false, + "do_sample": false, + "dynamic": true, + "global_topk": 256, + "on_device_sampling_config": true, + "temperature": 1.0, + "top_k": 1, + "top_k_kernel_enabled": false, + "top_p": 1.0 + }, + "output_logits": false, + "overrides_torch_dtype": true, + "pa_block_size": 1024, + "pa_num_blocks": 1, + "padding_side": "right", + "pp_degree": 1, + "prefix_buckets": null, + "qk_layernorm": false, + "qkv_kernel_enabled": false, + "qkv_kernel_fuse_residual_add": false, + "qkv_kernel_nbsd_layout": false, + "quantization_dtype": "int8", + "quantization_type": "per_tensor_symmetric", + "quantize_clamp_bound": Infinity, + "quantized": false, + "quantized_checkpoints_path": null, + "quantized_mlp_kernel_enabled": false, + "rmsnorm_quantize_kernel_enabled": false, + "router_topk_nki_kernel_enabled": null, + "rpl_reduce_dtype": null, + "save_sharded_checkpoint": true, + "scratchpad_page_size": null, + "seq_len": 1024, + "seq_len_threshold_for_cc_tiling": 16384, + "sequence_parallel_enabled": false, + "shared_mlp_nki_kernel_enabled": null, + "skip_sharding": false, + "skip_warmup": false, + "spec_batch_size": 1, + "speculation_length": 0, + "start_rank_id": 0, + "target": null, + "tile_cc": false, + "tkg_batch_size": 1, + "token_generation_buckets": [ + 1024 + ], + "token_tree_config": null, + "torch_dtype": "bfloat16", + "tp_degree": 2, + "vocab_parallel": false, + "weight_gather_seq_len_threshold": 32768, + "weights_to_skip_layout_optimization": [], + "world_size": 2 + }, + "no_repeat_ngram_size": 0, + "num_attention_heads": 32, + "num_beam_groups": 1, + "num_beams": 1, + "num_cores_per_group": 1, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sep_token_id": null, + "sliding_window": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torchscript": false, + "transformers_version": "4.51.0", + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/weights/tp0_sharded_checkpoint.safetensors b/weights/tp0_sharded_checkpoint.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..21070f559096d4349d93f985ce2e1dc359981a9c --- /dev/null +++ b/weights/tp0_sharded_checkpoint.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4877d7d89bdfdd5be52c247b3b385259dc7ec681b73afe86869d1c63a818d966 +size 135 diff --git a/weights/tp1_sharded_checkpoint.safetensors b/weights/tp1_sharded_checkpoint.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dbdd1e7c272b1135f429937b46a18cfeed597f30 --- /dev/null +++ b/weights/tp1_sharded_checkpoint.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b199546177db3fbfff12b306890012363eaef2e0c65937c1fa18a4135541fa64 +size 135