hqfang committed
Commit 2da8312 · verified · 1 Parent(s): 41aadd0

Upload folder using huggingface_hub
added_tokens.json ADDED
@@ -0,0 +1,594 @@
1
+ {
2
+ "<DEPTH_0>": 100280,
3
+ "<DEPTH_100>": 100380,
4
+ "<DEPTH_101>": 100381,
5
+ "<DEPTH_102>": 100382,
6
+ "<DEPTH_103>": 100383,
7
+ "<DEPTH_104>": 100384,
8
+ "<DEPTH_105>": 100385,
9
+ "<DEPTH_106>": 100386,
10
+ "<DEPTH_107>": 100387,
11
+ "<DEPTH_108>": 100388,
12
+ "<DEPTH_109>": 100389,
13
+ "<DEPTH_10>": 100290,
14
+ "<DEPTH_110>": 100390,
15
+ "<DEPTH_111>": 100391,
16
+ "<DEPTH_112>": 100392,
17
+ "<DEPTH_113>": 100393,
18
+ "<DEPTH_114>": 100394,
19
+ "<DEPTH_115>": 100395,
20
+ "<DEPTH_116>": 100396,
21
+ "<DEPTH_117>": 100397,
22
+ "<DEPTH_118>": 100398,
23
+ "<DEPTH_119>": 100399,
24
+ "<DEPTH_11>": 100291,
25
+ "<DEPTH_120>": 100400,
26
+ "<DEPTH_121>": 100401,
27
+ "<DEPTH_122>": 100402,
28
+ "<DEPTH_123>": 100403,
29
+ "<DEPTH_124>": 100404,
30
+ "<DEPTH_125>": 100405,
31
+ "<DEPTH_126>": 100406,
32
+ "<DEPTH_127>": 100407,
33
+ "<DEPTH_12>": 100292,
34
+ "<DEPTH_13>": 100293,
35
+ "<DEPTH_14>": 100294,
36
+ "<DEPTH_15>": 100295,
37
+ "<DEPTH_16>": 100296,
38
+ "<DEPTH_17>": 100297,
39
+ "<DEPTH_18>": 100298,
40
+ "<DEPTH_19>": 100299,
41
+ "<DEPTH_1>": 100281,
42
+ "<DEPTH_20>": 100300,
43
+ "<DEPTH_21>": 100301,
44
+ "<DEPTH_22>": 100302,
45
+ "<DEPTH_23>": 100303,
46
+ "<DEPTH_24>": 100304,
47
+ "<DEPTH_25>": 100305,
48
+ "<DEPTH_26>": 100306,
49
+ "<DEPTH_27>": 100307,
50
+ "<DEPTH_28>": 100308,
51
+ "<DEPTH_29>": 100309,
52
+ "<DEPTH_2>": 100282,
53
+ "<DEPTH_30>": 100310,
54
+ "<DEPTH_31>": 100311,
55
+ "<DEPTH_32>": 100312,
56
+ "<DEPTH_33>": 100313,
57
+ "<DEPTH_34>": 100314,
58
+ "<DEPTH_35>": 100315,
59
+ "<DEPTH_36>": 100316,
60
+ "<DEPTH_37>": 100317,
61
+ "<DEPTH_38>": 100318,
62
+ "<DEPTH_39>": 100319,
63
+ "<DEPTH_3>": 100283,
64
+ "<DEPTH_40>": 100320,
65
+ "<DEPTH_41>": 100321,
66
+ "<DEPTH_42>": 100322,
67
+ "<DEPTH_43>": 100323,
68
+ "<DEPTH_44>": 100324,
69
+ "<DEPTH_45>": 100325,
70
+ "<DEPTH_46>": 100326,
71
+ "<DEPTH_47>": 100327,
72
+ "<DEPTH_48>": 100328,
73
+ "<DEPTH_49>": 100329,
74
+ "<DEPTH_4>": 100284,
75
+ "<DEPTH_50>": 100330,
76
+ "<DEPTH_51>": 100331,
77
+ "<DEPTH_52>": 100332,
78
+ "<DEPTH_53>": 100333,
79
+ "<DEPTH_54>": 100334,
80
+ "<DEPTH_55>": 100335,
81
+ "<DEPTH_56>": 100336,
82
+ "<DEPTH_57>": 100337,
83
+ "<DEPTH_58>": 100338,
84
+ "<DEPTH_59>": 100339,
85
+ "<DEPTH_5>": 100285,
86
+ "<DEPTH_60>": 100340,
87
+ "<DEPTH_61>": 100341,
88
+ "<DEPTH_62>": 100342,
89
+ "<DEPTH_63>": 100343,
90
+ "<DEPTH_64>": 100344,
91
+ "<DEPTH_65>": 100345,
92
+ "<DEPTH_66>": 100346,
93
+ "<DEPTH_67>": 100347,
94
+ "<DEPTH_68>": 100348,
95
+ "<DEPTH_69>": 100349,
96
+ "<DEPTH_6>": 100286,
97
+ "<DEPTH_70>": 100350,
98
+ "<DEPTH_71>": 100351,
99
+ "<DEPTH_72>": 100352,
100
+ "<DEPTH_73>": 100353,
101
+ "<DEPTH_74>": 100354,
102
+ "<DEPTH_75>": 100355,
103
+ "<DEPTH_76>": 100356,
104
+ "<DEPTH_77>": 100357,
105
+ "<DEPTH_78>": 100358,
106
+ "<DEPTH_79>": 100359,
107
+ "<DEPTH_7>": 100287,
108
+ "<DEPTH_80>": 100360,
109
+ "<DEPTH_81>": 100361,
110
+ "<DEPTH_82>": 100362,
111
+ "<DEPTH_83>": 100363,
112
+ "<DEPTH_84>": 100364,
113
+ "<DEPTH_85>": 100365,
114
+ "<DEPTH_86>": 100366,
115
+ "<DEPTH_87>": 100367,
116
+ "<DEPTH_88>": 100368,
117
+ "<DEPTH_89>": 100369,
118
+ "<DEPTH_8>": 100288,
119
+ "<DEPTH_90>": 100370,
120
+ "<DEPTH_91>": 100371,
121
+ "<DEPTH_92>": 100372,
122
+ "<DEPTH_93>": 100373,
123
+ "<DEPTH_94>": 100374,
124
+ "<DEPTH_95>": 100375,
125
+ "<DEPTH_96>": 100376,
126
+ "<DEPTH_97>": 100377,
127
+ "<DEPTH_98>": 100378,
128
+ "<DEPTH_99>": 100379,
129
+ "<DEPTH_9>": 100289,
130
+ "<DEPTH_END>": 100279,
131
+ "<DEPTH_START>": 100278,
132
+ "<im_col>": 100867,
133
+ "<im_end>": 100865,
134
+ "<im_low>": 100869,
135
+ "<im_patch>": 100866,
136
+ "<im_start>": 100864,
137
+ "<|image|>": 100868,
138
+ "|<EXTRA_TOKENS_0>|": 100408,
139
+ "|<EXTRA_TOKENS_100>|": 100508,
140
+ "|<EXTRA_TOKENS_101>|": 100509,
141
+ "|<EXTRA_TOKENS_102>|": 100510,
142
+ "|<EXTRA_TOKENS_103>|": 100511,
143
+ "|<EXTRA_TOKENS_104>|": 100512,
144
+ "|<EXTRA_TOKENS_105>|": 100513,
145
+ "|<EXTRA_TOKENS_106>|": 100514,
146
+ "|<EXTRA_TOKENS_107>|": 100515,
147
+ "|<EXTRA_TOKENS_108>|": 100516,
148
+ "|<EXTRA_TOKENS_109>|": 100517,
149
+ "|<EXTRA_TOKENS_10>|": 100418,
150
+ "|<EXTRA_TOKENS_110>|": 100518,
151
+ "|<EXTRA_TOKENS_111>|": 100519,
152
+ "|<EXTRA_TOKENS_112>|": 100520,
153
+ "|<EXTRA_TOKENS_113>|": 100521,
154
+ "|<EXTRA_TOKENS_114>|": 100522,
155
+ "|<EXTRA_TOKENS_115>|": 100523,
156
+ "|<EXTRA_TOKENS_116>|": 100524,
157
+ "|<EXTRA_TOKENS_117>|": 100525,
158
+ "|<EXTRA_TOKENS_118>|": 100526,
159
+ "|<EXTRA_TOKENS_119>|": 100527,
160
+ "|<EXTRA_TOKENS_11>|": 100419,
161
+ "|<EXTRA_TOKENS_120>|": 100528,
162
+ "|<EXTRA_TOKENS_121>|": 100529,
163
+ "|<EXTRA_TOKENS_122>|": 100530,
164
+ "|<EXTRA_TOKENS_123>|": 100531,
165
+ "|<EXTRA_TOKENS_124>|": 100532,
166
+ "|<EXTRA_TOKENS_125>|": 100533,
167
+ "|<EXTRA_TOKENS_126>|": 100534,
168
+ "|<EXTRA_TOKENS_127>|": 100535,
169
+ "|<EXTRA_TOKENS_128>|": 100536,
170
+ "|<EXTRA_TOKENS_129>|": 100537,
171
+ "|<EXTRA_TOKENS_12>|": 100420,
172
+ "|<EXTRA_TOKENS_130>|": 100538,
173
+ "|<EXTRA_TOKENS_131>|": 100539,
174
+ "|<EXTRA_TOKENS_132>|": 100540,
175
+ "|<EXTRA_TOKENS_133>|": 100541,
176
+ "|<EXTRA_TOKENS_134>|": 100542,
177
+ "|<EXTRA_TOKENS_135>|": 100543,
178
+ "|<EXTRA_TOKENS_136>|": 100544,
179
+ "|<EXTRA_TOKENS_137>|": 100545,
180
+ "|<EXTRA_TOKENS_138>|": 100546,
181
+ "|<EXTRA_TOKENS_139>|": 100547,
182
+ "|<EXTRA_TOKENS_13>|": 100421,
183
+ "|<EXTRA_TOKENS_140>|": 100548,
184
+ "|<EXTRA_TOKENS_141>|": 100549,
185
+ "|<EXTRA_TOKENS_142>|": 100550,
186
+ "|<EXTRA_TOKENS_143>|": 100551,
187
+ "|<EXTRA_TOKENS_144>|": 100552,
188
+ "|<EXTRA_TOKENS_145>|": 100553,
189
+ "|<EXTRA_TOKENS_146>|": 100554,
190
+ "|<EXTRA_TOKENS_147>|": 100555,
191
+ "|<EXTRA_TOKENS_148>|": 100556,
192
+ "|<EXTRA_TOKENS_149>|": 100557,
193
+ "|<EXTRA_TOKENS_14>|": 100422,
194
+ "|<EXTRA_TOKENS_150>|": 100558,
195
+ "|<EXTRA_TOKENS_151>|": 100559,
196
+ "|<EXTRA_TOKENS_152>|": 100560,
197
+ "|<EXTRA_TOKENS_153>|": 100561,
198
+ "|<EXTRA_TOKENS_154>|": 100562,
199
+ "|<EXTRA_TOKENS_155>|": 100563,
200
+ "|<EXTRA_TOKENS_156>|": 100564,
201
+ "|<EXTRA_TOKENS_157>|": 100565,
202
+ "|<EXTRA_TOKENS_158>|": 100566,
203
+ "|<EXTRA_TOKENS_159>|": 100567,
204
+ "|<EXTRA_TOKENS_15>|": 100423,
205
+ "|<EXTRA_TOKENS_160>|": 100568,
206
+ "|<EXTRA_TOKENS_161>|": 100569,
207
+ "|<EXTRA_TOKENS_162>|": 100570,
208
+ "|<EXTRA_TOKENS_163>|": 100571,
209
+ "|<EXTRA_TOKENS_164>|": 100572,
210
+ "|<EXTRA_TOKENS_165>|": 100573,
211
+ "|<EXTRA_TOKENS_166>|": 100574,
212
+ "|<EXTRA_TOKENS_167>|": 100575,
213
+ "|<EXTRA_TOKENS_168>|": 100576,
214
+ "|<EXTRA_TOKENS_169>|": 100577,
215
+ "|<EXTRA_TOKENS_16>|": 100424,
216
+ "|<EXTRA_TOKENS_170>|": 100578,
217
+ "|<EXTRA_TOKENS_171>|": 100579,
218
+ "|<EXTRA_TOKENS_172>|": 100580,
219
+ "|<EXTRA_TOKENS_173>|": 100581,
220
+ "|<EXTRA_TOKENS_174>|": 100582,
221
+ "|<EXTRA_TOKENS_175>|": 100583,
222
+ "|<EXTRA_TOKENS_176>|": 100584,
223
+ "|<EXTRA_TOKENS_177>|": 100585,
224
+ "|<EXTRA_TOKENS_178>|": 100586,
225
+ "|<EXTRA_TOKENS_179>|": 100587,
226
+ "|<EXTRA_TOKENS_17>|": 100425,
227
+ "|<EXTRA_TOKENS_180>|": 100588,
228
+ "|<EXTRA_TOKENS_181>|": 100589,
229
+ "|<EXTRA_TOKENS_182>|": 100590,
230
+ "|<EXTRA_TOKENS_183>|": 100591,
231
+ "|<EXTRA_TOKENS_184>|": 100592,
232
+ "|<EXTRA_TOKENS_185>|": 100593,
233
+ "|<EXTRA_TOKENS_186>|": 100594,
234
+ "|<EXTRA_TOKENS_187>|": 100595,
235
+ "|<EXTRA_TOKENS_188>|": 100596,
236
+ "|<EXTRA_TOKENS_189>|": 100597,
237
+ "|<EXTRA_TOKENS_18>|": 100426,
238
+ "|<EXTRA_TOKENS_190>|": 100598,
239
+ "|<EXTRA_TOKENS_191>|": 100599,
240
+ "|<EXTRA_TOKENS_192>|": 100600,
241
+ "|<EXTRA_TOKENS_193>|": 100601,
242
+ "|<EXTRA_TOKENS_194>|": 100602,
243
+ "|<EXTRA_TOKENS_195>|": 100603,
244
+ "|<EXTRA_TOKENS_196>|": 100604,
245
+ "|<EXTRA_TOKENS_197>|": 100605,
246
+ "|<EXTRA_TOKENS_198>|": 100606,
247
+ "|<EXTRA_TOKENS_199>|": 100607,
248
+ "|<EXTRA_TOKENS_19>|": 100427,
249
+ "|<EXTRA_TOKENS_1>|": 100409,
250
+ "|<EXTRA_TOKENS_200>|": 100608,
251
+ "|<EXTRA_TOKENS_201>|": 100609,
252
+ "|<EXTRA_TOKENS_202>|": 100610,
253
+ "|<EXTRA_TOKENS_203>|": 100611,
254
+ "|<EXTRA_TOKENS_204>|": 100612,
255
+ "|<EXTRA_TOKENS_205>|": 100613,
256
+ "|<EXTRA_TOKENS_206>|": 100614,
257
+ "|<EXTRA_TOKENS_207>|": 100615,
258
+ "|<EXTRA_TOKENS_208>|": 100616,
259
+ "|<EXTRA_TOKENS_209>|": 100617,
260
+ "|<EXTRA_TOKENS_20>|": 100428,
261
+ "|<EXTRA_TOKENS_210>|": 100618,
262
+ "|<EXTRA_TOKENS_211>|": 100619,
263
+ "|<EXTRA_TOKENS_212>|": 100620,
264
+ "|<EXTRA_TOKENS_213>|": 100621,
265
+ "|<EXTRA_TOKENS_214>|": 100622,
266
+ "|<EXTRA_TOKENS_215>|": 100623,
267
+ "|<EXTRA_TOKENS_216>|": 100624,
268
+ "|<EXTRA_TOKENS_217>|": 100625,
269
+ "|<EXTRA_TOKENS_218>|": 100626,
270
+ "|<EXTRA_TOKENS_219>|": 100627,
271
+ "|<EXTRA_TOKENS_21>|": 100429,
272
+ "|<EXTRA_TOKENS_220>|": 100628,
273
+ "|<EXTRA_TOKENS_221>|": 100629,
274
+ "|<EXTRA_TOKENS_222>|": 100630,
275
+ "|<EXTRA_TOKENS_223>|": 100631,
276
+ "|<EXTRA_TOKENS_224>|": 100632,
277
+ "|<EXTRA_TOKENS_225>|": 100633,
278
+ "|<EXTRA_TOKENS_226>|": 100634,
279
+ "|<EXTRA_TOKENS_227>|": 100635,
280
+ "|<EXTRA_TOKENS_228>|": 100636,
281
+ "|<EXTRA_TOKENS_229>|": 100637,
282
+ "|<EXTRA_TOKENS_22>|": 100430,
283
+ "|<EXTRA_TOKENS_230>|": 100638,
284
+ "|<EXTRA_TOKENS_231>|": 100639,
285
+ "|<EXTRA_TOKENS_232>|": 100640,
286
+ "|<EXTRA_TOKENS_233>|": 100641,
287
+ "|<EXTRA_TOKENS_234>|": 100642,
288
+ "|<EXTRA_TOKENS_235>|": 100643,
289
+ "|<EXTRA_TOKENS_236>|": 100644,
290
+ "|<EXTRA_TOKENS_237>|": 100645,
291
+ "|<EXTRA_TOKENS_238>|": 100646,
292
+ "|<EXTRA_TOKENS_239>|": 100647,
293
+ "|<EXTRA_TOKENS_23>|": 100431,
294
+ "|<EXTRA_TOKENS_240>|": 100648,
295
+ "|<EXTRA_TOKENS_241>|": 100649,
296
+ "|<EXTRA_TOKENS_242>|": 100650,
297
+ "|<EXTRA_TOKENS_243>|": 100651,
298
+ "|<EXTRA_TOKENS_244>|": 100652,
299
+ "|<EXTRA_TOKENS_245>|": 100653,
300
+ "|<EXTRA_TOKENS_246>|": 100654,
301
+ "|<EXTRA_TOKENS_247>|": 100655,
302
+ "|<EXTRA_TOKENS_248>|": 100656,
303
+ "|<EXTRA_TOKENS_249>|": 100657,
304
+ "|<EXTRA_TOKENS_24>|": 100432,
305
+ "|<EXTRA_TOKENS_250>|": 100658,
306
+ "|<EXTRA_TOKENS_251>|": 100659,
307
+ "|<EXTRA_TOKENS_252>|": 100660,
308
+ "|<EXTRA_TOKENS_253>|": 100661,
309
+ "|<EXTRA_TOKENS_254>|": 100662,
310
+ "|<EXTRA_TOKENS_255>|": 100663,
311
+ "|<EXTRA_TOKENS_256>|": 100664,
312
+ "|<EXTRA_TOKENS_257>|": 100665,
313
+ "|<EXTRA_TOKENS_258>|": 100666,
314
+ "|<EXTRA_TOKENS_259>|": 100667,
315
+ "|<EXTRA_TOKENS_25>|": 100433,
316
+ "|<EXTRA_TOKENS_260>|": 100668,
317
+ "|<EXTRA_TOKENS_261>|": 100669,
318
+ "|<EXTRA_TOKENS_262>|": 100670,
319
+ "|<EXTRA_TOKENS_263>|": 100671,
320
+ "|<EXTRA_TOKENS_264>|": 100672,
321
+ "|<EXTRA_TOKENS_265>|": 100673,
322
+ "|<EXTRA_TOKENS_266>|": 100674,
323
+ "|<EXTRA_TOKENS_267>|": 100675,
324
+ "|<EXTRA_TOKENS_268>|": 100676,
325
+ "|<EXTRA_TOKENS_269>|": 100677,
326
+ "|<EXTRA_TOKENS_26>|": 100434,
327
+ "|<EXTRA_TOKENS_270>|": 100678,
328
+ "|<EXTRA_TOKENS_271>|": 100679,
329
+ "|<EXTRA_TOKENS_272>|": 100680,
330
+ "|<EXTRA_TOKENS_273>|": 100681,
331
+ "|<EXTRA_TOKENS_274>|": 100682,
332
+ "|<EXTRA_TOKENS_275>|": 100683,
333
+ "|<EXTRA_TOKENS_276>|": 100684,
334
+ "|<EXTRA_TOKENS_277>|": 100685,
335
+ "|<EXTRA_TOKENS_278>|": 100686,
336
+ "|<EXTRA_TOKENS_279>|": 100687,
337
+ "|<EXTRA_TOKENS_27>|": 100435,
338
+ "|<EXTRA_TOKENS_280>|": 100688,
339
+ "|<EXTRA_TOKENS_281>|": 100689,
340
+ "|<EXTRA_TOKENS_282>|": 100690,
341
+ "|<EXTRA_TOKENS_283>|": 100691,
342
+ "|<EXTRA_TOKENS_284>|": 100692,
343
+ "|<EXTRA_TOKENS_285>|": 100693,
344
+ "|<EXTRA_TOKENS_286>|": 100694,
345
+ "|<EXTRA_TOKENS_287>|": 100695,
346
+ "|<EXTRA_TOKENS_288>|": 100696,
347
+ "|<EXTRA_TOKENS_289>|": 100697,
348
+ "|<EXTRA_TOKENS_28>|": 100436,
349
+ "|<EXTRA_TOKENS_290>|": 100698,
350
+ "|<EXTRA_TOKENS_291>|": 100699,
351
+ "|<EXTRA_TOKENS_292>|": 100700,
352
+ "|<EXTRA_TOKENS_293>|": 100701,
353
+ "|<EXTRA_TOKENS_294>|": 100702,
354
+ "|<EXTRA_TOKENS_295>|": 100703,
355
+ "|<EXTRA_TOKENS_296>|": 100704,
356
+ "|<EXTRA_TOKENS_297>|": 100705,
357
+ "|<EXTRA_TOKENS_298>|": 100706,
358
+ "|<EXTRA_TOKENS_299>|": 100707,
359
+ "|<EXTRA_TOKENS_29>|": 100437,
360
+ "|<EXTRA_TOKENS_2>|": 100410,
361
+ "|<EXTRA_TOKENS_300>|": 100708,
362
+ "|<EXTRA_TOKENS_301>|": 100709,
363
+ "|<EXTRA_TOKENS_302>|": 100710,
364
+ "|<EXTRA_TOKENS_303>|": 100711,
365
+ "|<EXTRA_TOKENS_304>|": 100712,
366
+ "|<EXTRA_TOKENS_305>|": 100713,
367
+ "|<EXTRA_TOKENS_306>|": 100714,
368
+ "|<EXTRA_TOKENS_307>|": 100715,
369
+ "|<EXTRA_TOKENS_308>|": 100716,
370
+ "|<EXTRA_TOKENS_309>|": 100717,
371
+ "|<EXTRA_TOKENS_30>|": 100438,
372
+ "|<EXTRA_TOKENS_310>|": 100718,
373
+ "|<EXTRA_TOKENS_311>|": 100719,
374
+ "|<EXTRA_TOKENS_312>|": 100720,
375
+ "|<EXTRA_TOKENS_313>|": 100721,
376
+ "|<EXTRA_TOKENS_314>|": 100722,
377
+ "|<EXTRA_TOKENS_315>|": 100723,
378
+ "|<EXTRA_TOKENS_316>|": 100724,
379
+ "|<EXTRA_TOKENS_317>|": 100725,
380
+ "|<EXTRA_TOKENS_318>|": 100726,
381
+ "|<EXTRA_TOKENS_319>|": 100727,
382
+ "|<EXTRA_TOKENS_31>|": 100439,
383
+ "|<EXTRA_TOKENS_320>|": 100728,
384
+ "|<EXTRA_TOKENS_321>|": 100729,
385
+ "|<EXTRA_TOKENS_322>|": 100730,
386
+ "|<EXTRA_TOKENS_323>|": 100731,
387
+ "|<EXTRA_TOKENS_324>|": 100732,
388
+ "|<EXTRA_TOKENS_325>|": 100733,
389
+ "|<EXTRA_TOKENS_326>|": 100734,
390
+ "|<EXTRA_TOKENS_327>|": 100735,
391
+ "|<EXTRA_TOKENS_328>|": 100736,
392
+ "|<EXTRA_TOKENS_329>|": 100737,
393
+ "|<EXTRA_TOKENS_32>|": 100440,
394
+ "|<EXTRA_TOKENS_330>|": 100738,
395
+ "|<EXTRA_TOKENS_331>|": 100739,
396
+ "|<EXTRA_TOKENS_332>|": 100740,
397
+ "|<EXTRA_TOKENS_333>|": 100741,
398
+ "|<EXTRA_TOKENS_334>|": 100742,
399
+ "|<EXTRA_TOKENS_335>|": 100743,
400
+ "|<EXTRA_TOKENS_336>|": 100744,
401
+ "|<EXTRA_TOKENS_337>|": 100745,
402
+ "|<EXTRA_TOKENS_338>|": 100746,
403
+ "|<EXTRA_TOKENS_339>|": 100747,
404
+ "|<EXTRA_TOKENS_33>|": 100441,
405
+ "|<EXTRA_TOKENS_340>|": 100748,
406
+ "|<EXTRA_TOKENS_341>|": 100749,
407
+ "|<EXTRA_TOKENS_342>|": 100750,
408
+ "|<EXTRA_TOKENS_343>|": 100751,
409
+ "|<EXTRA_TOKENS_344>|": 100752,
410
+ "|<EXTRA_TOKENS_345>|": 100753,
411
+ "|<EXTRA_TOKENS_346>|": 100754,
412
+ "|<EXTRA_TOKENS_347>|": 100755,
413
+ "|<EXTRA_TOKENS_348>|": 100756,
414
+ "|<EXTRA_TOKENS_349>|": 100757,
415
+ "|<EXTRA_TOKENS_34>|": 100442,
416
+ "|<EXTRA_TOKENS_350>|": 100758,
417
+ "|<EXTRA_TOKENS_351>|": 100759,
418
+ "|<EXTRA_TOKENS_352>|": 100760,
419
+ "|<EXTRA_TOKENS_353>|": 100761,
420
+ "|<EXTRA_TOKENS_354>|": 100762,
421
+ "|<EXTRA_TOKENS_355>|": 100763,
422
+ "|<EXTRA_TOKENS_356>|": 100764,
423
+ "|<EXTRA_TOKENS_357>|": 100765,
424
+ "|<EXTRA_TOKENS_358>|": 100766,
425
+ "|<EXTRA_TOKENS_359>|": 100767,
426
+ "|<EXTRA_TOKENS_35>|": 100443,
427
+ "|<EXTRA_TOKENS_360>|": 100768,
428
+ "|<EXTRA_TOKENS_361>|": 100769,
429
+ "|<EXTRA_TOKENS_362>|": 100770,
430
+ "|<EXTRA_TOKENS_363>|": 100771,
431
+ "|<EXTRA_TOKENS_364>|": 100772,
432
+ "|<EXTRA_TOKENS_365>|": 100773,
433
+ "|<EXTRA_TOKENS_366>|": 100774,
434
+ "|<EXTRA_TOKENS_367>|": 100775,
435
+ "|<EXTRA_TOKENS_368>|": 100776,
436
+ "|<EXTRA_TOKENS_369>|": 100777,
437
+ "|<EXTRA_TOKENS_36>|": 100444,
438
+ "|<EXTRA_TOKENS_370>|": 100778,
439
+ "|<EXTRA_TOKENS_371>|": 100779,
440
+ "|<EXTRA_TOKENS_372>|": 100780,
441
+ "|<EXTRA_TOKENS_373>|": 100781,
442
+ "|<EXTRA_TOKENS_374>|": 100782,
443
+ "|<EXTRA_TOKENS_375>|": 100783,
444
+ "|<EXTRA_TOKENS_376>|": 100784,
445
+ "|<EXTRA_TOKENS_377>|": 100785,
446
+ "|<EXTRA_TOKENS_378>|": 100786,
447
+ "|<EXTRA_TOKENS_379>|": 100787,
448
+ "|<EXTRA_TOKENS_37>|": 100445,
449
+ "|<EXTRA_TOKENS_380>|": 100788,
450
+ "|<EXTRA_TOKENS_381>|": 100789,
451
+ "|<EXTRA_TOKENS_382>|": 100790,
452
+ "|<EXTRA_TOKENS_383>|": 100791,
453
+ "|<EXTRA_TOKENS_384>|": 100792,
454
+ "|<EXTRA_TOKENS_385>|": 100793,
455
+ "|<EXTRA_TOKENS_386>|": 100794,
456
+ "|<EXTRA_TOKENS_387>|": 100795,
457
+ "|<EXTRA_TOKENS_388>|": 100796,
458
+ "|<EXTRA_TOKENS_389>|": 100797,
459
+ "|<EXTRA_TOKENS_38>|": 100446,
460
+ "|<EXTRA_TOKENS_390>|": 100798,
461
+ "|<EXTRA_TOKENS_391>|": 100799,
462
+ "|<EXTRA_TOKENS_392>|": 100800,
463
+ "|<EXTRA_TOKENS_393>|": 100801,
464
+ "|<EXTRA_TOKENS_394>|": 100802,
465
+ "|<EXTRA_TOKENS_395>|": 100803,
466
+ "|<EXTRA_TOKENS_396>|": 100804,
467
+ "|<EXTRA_TOKENS_397>|": 100805,
468
+ "|<EXTRA_TOKENS_398>|": 100806,
469
+ "|<EXTRA_TOKENS_399>|": 100807,
470
+ "|<EXTRA_TOKENS_39>|": 100447,
471
+ "|<EXTRA_TOKENS_3>|": 100411,
472
+ "|<EXTRA_TOKENS_400>|": 100808,
473
+ "|<EXTRA_TOKENS_401>|": 100809,
474
+ "|<EXTRA_TOKENS_402>|": 100810,
475
+ "|<EXTRA_TOKENS_403>|": 100811,
476
+ "|<EXTRA_TOKENS_404>|": 100812,
477
+ "|<EXTRA_TOKENS_405>|": 100813,
478
+ "|<EXTRA_TOKENS_406>|": 100814,
479
+ "|<EXTRA_TOKENS_407>|": 100815,
480
+ "|<EXTRA_TOKENS_408>|": 100816,
481
+ "|<EXTRA_TOKENS_409>|": 100817,
482
+ "|<EXTRA_TOKENS_40>|": 100448,
483
+ "|<EXTRA_TOKENS_410>|": 100818,
484
+ "|<EXTRA_TOKENS_411>|": 100819,
485
+ "|<EXTRA_TOKENS_412>|": 100820,
486
+ "|<EXTRA_TOKENS_413>|": 100821,
487
+ "|<EXTRA_TOKENS_414>|": 100822,
488
+ "|<EXTRA_TOKENS_415>|": 100823,
489
+ "|<EXTRA_TOKENS_416>|": 100824,
490
+ "|<EXTRA_TOKENS_417>|": 100825,
491
+ "|<EXTRA_TOKENS_418>|": 100826,
492
+ "|<EXTRA_TOKENS_419>|": 100827,
493
+ "|<EXTRA_TOKENS_41>|": 100449,
494
+ "|<EXTRA_TOKENS_420>|": 100828,
495
+ "|<EXTRA_TOKENS_421>|": 100829,
496
+ "|<EXTRA_TOKENS_422>|": 100830,
497
+ "|<EXTRA_TOKENS_423>|": 100831,
498
+ "|<EXTRA_TOKENS_424>|": 100832,
499
+ "|<EXTRA_TOKENS_425>|": 100833,
500
+ "|<EXTRA_TOKENS_426>|": 100834,
501
+ "|<EXTRA_TOKENS_427>|": 100835,
502
+ "|<EXTRA_TOKENS_428>|": 100836,
503
+ "|<EXTRA_TOKENS_429>|": 100837,
504
+ "|<EXTRA_TOKENS_42>|": 100450,
505
+ "|<EXTRA_TOKENS_430>|": 100838,
506
+ "|<EXTRA_TOKENS_431>|": 100839,
507
+ "|<EXTRA_TOKENS_432>|": 100840,
508
+ "|<EXTRA_TOKENS_433>|": 100841,
509
+ "|<EXTRA_TOKENS_434>|": 100842,
510
+ "|<EXTRA_TOKENS_435>|": 100843,
511
+ "|<EXTRA_TOKENS_436>|": 100844,
512
+ "|<EXTRA_TOKENS_437>|": 100845,
513
+ "|<EXTRA_TOKENS_438>|": 100846,
514
+ "|<EXTRA_TOKENS_439>|": 100847,
515
+ "|<EXTRA_TOKENS_43>|": 100451,
516
+ "|<EXTRA_TOKENS_440>|": 100848,
517
+ "|<EXTRA_TOKENS_441>|": 100849,
518
+ "|<EXTRA_TOKENS_442>|": 100850,
519
+ "|<EXTRA_TOKENS_443>|": 100851,
520
+ "|<EXTRA_TOKENS_444>|": 100852,
521
+ "|<EXTRA_TOKENS_445>|": 100853,
522
+ "|<EXTRA_TOKENS_446>|": 100854,
523
+ "|<EXTRA_TOKENS_447>|": 100855,
524
+ "|<EXTRA_TOKENS_448>|": 100856,
525
+ "|<EXTRA_TOKENS_449>|": 100857,
526
+ "|<EXTRA_TOKENS_44>|": 100452,
527
+ "|<EXTRA_TOKENS_450>|": 100858,
528
+ "|<EXTRA_TOKENS_451>|": 100859,
529
+ "|<EXTRA_TOKENS_452>|": 100860,
530
+ "|<EXTRA_TOKENS_453>|": 100861,
531
+ "|<EXTRA_TOKENS_454>|": 100862,
532
+ "|<EXTRA_TOKENS_455>|": 100863,
533
+ "|<EXTRA_TOKENS_45>|": 100453,
534
+ "|<EXTRA_TOKENS_46>|": 100454,
535
+ "|<EXTRA_TOKENS_47>|": 100455,
536
+ "|<EXTRA_TOKENS_48>|": 100456,
537
+ "|<EXTRA_TOKENS_49>|": 100457,
538
+ "|<EXTRA_TOKENS_4>|": 100412,
539
+ "|<EXTRA_TOKENS_50>|": 100458,
540
+ "|<EXTRA_TOKENS_51>|": 100459,
541
+ "|<EXTRA_TOKENS_52>|": 100460,
542
+ "|<EXTRA_TOKENS_53>|": 100461,
543
+ "|<EXTRA_TOKENS_54>|": 100462,
544
+ "|<EXTRA_TOKENS_55>|": 100463,
545
+ "|<EXTRA_TOKENS_56>|": 100464,
546
+ "|<EXTRA_TOKENS_57>|": 100465,
547
+ "|<EXTRA_TOKENS_58>|": 100466,
548
+ "|<EXTRA_TOKENS_59>|": 100467,
549
+ "|<EXTRA_TOKENS_5>|": 100413,
550
+ "|<EXTRA_TOKENS_60>|": 100468,
551
+ "|<EXTRA_TOKENS_61>|": 100469,
552
+ "|<EXTRA_TOKENS_62>|": 100470,
553
+ "|<EXTRA_TOKENS_63>|": 100471,
554
+ "|<EXTRA_TOKENS_64>|": 100472,
555
+ "|<EXTRA_TOKENS_65>|": 100473,
556
+ "|<EXTRA_TOKENS_66>|": 100474,
557
+ "|<EXTRA_TOKENS_67>|": 100475,
558
+ "|<EXTRA_TOKENS_68>|": 100476,
559
+ "|<EXTRA_TOKENS_69>|": 100477,
560
+ "|<EXTRA_TOKENS_6>|": 100414,
561
+ "|<EXTRA_TOKENS_70>|": 100478,
562
+ "|<EXTRA_TOKENS_71>|": 100479,
563
+ "|<EXTRA_TOKENS_72>|": 100480,
564
+ "|<EXTRA_TOKENS_73>|": 100481,
565
+ "|<EXTRA_TOKENS_74>|": 100482,
566
+ "|<EXTRA_TOKENS_75>|": 100483,
567
+ "|<EXTRA_TOKENS_76>|": 100484,
568
+ "|<EXTRA_TOKENS_77>|": 100485,
569
+ "|<EXTRA_TOKENS_78>|": 100486,
570
+ "|<EXTRA_TOKENS_79>|": 100487,
571
+ "|<EXTRA_TOKENS_7>|": 100415,
572
+ "|<EXTRA_TOKENS_80>|": 100488,
573
+ "|<EXTRA_TOKENS_81>|": 100489,
574
+ "|<EXTRA_TOKENS_82>|": 100490,
575
+ "|<EXTRA_TOKENS_83>|": 100491,
576
+ "|<EXTRA_TOKENS_84>|": 100492,
577
+ "|<EXTRA_TOKENS_85>|": 100493,
578
+ "|<EXTRA_TOKENS_86>|": 100494,
579
+ "|<EXTRA_TOKENS_87>|": 100495,
580
+ "|<EXTRA_TOKENS_88>|": 100496,
581
+ "|<EXTRA_TOKENS_89>|": 100497,
582
+ "|<EXTRA_TOKENS_8>|": 100416,
583
+ "|<EXTRA_TOKENS_90>|": 100498,
584
+ "|<EXTRA_TOKENS_91>|": 100499,
585
+ "|<EXTRA_TOKENS_92>|": 100500,
586
+ "|<EXTRA_TOKENS_93>|": 100501,
587
+ "|<EXTRA_TOKENS_94>|": 100502,
588
+ "|<EXTRA_TOKENS_95>|": 100503,
589
+ "|<EXTRA_TOKENS_96>|": 100504,
590
+ "|<EXTRA_TOKENS_97>|": 100505,
591
+ "|<EXTRA_TOKENS_98>|": 100506,
592
+ "|<EXTRA_TOKENS_99>|": 100507,
593
+ "|<EXTRA_TOKENS_9>|": 100417
594
+ }
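
The map above registers the depth tokens, the image-marker tokens, and the `|<EXTRA_TOKENS_…>|` bank as added vocabulary entries. A minimal sanity-check sketch — the repo id below is a placeholder, and loading via `AutoTokenizer` with `trust_remote_code` is an assumption about how this checkpoint is meant to be consumed:

```python
from transformers import AutoTokenizer

# "allenai/MolmoAct-example" is a hypothetical repo id; substitute the real one.
tok = AutoTokenizer.from_pretrained("allenai/MolmoAct-example", trust_remote_code=True)

# Entries from added_tokens.json become ordinary vocabulary entries,
# so their ids can be read straight back from the tokenizer.
assert tok.convert_tokens_to_ids("<im_patch>") == 100866       # matches image_patch_id in config.json
assert tok.convert_tokens_to_ids("<DEPTH_START>") == 100278
assert tok.convert_tokens_to_ids("|<EXTRA_TOKENS_0>|") == 100408
```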
chat_template.jinja ADDED
@@ -0,0 +1 @@
1
+ {% for message in messages %}{%- if (loop.index % 2 == 1 and message['role'].lower() != 'user') or (loop.index % 2 == 0 and message['role'].lower() != 'assistant') -%}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{%- endif -%}{{ message['role'].capitalize() + ': ' }}{% if message['content'] is string %}{{ message['content'] }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'text' %}{{ content['text'] }}{%- if not loop.last -%}{{ ' ' }}{%- endif -%}{% endif %}{% endfor %}{% endif %}{%- if not loop.last -%}{{ ' ' }}{%- endif -%}{% endfor %}{% if add_generation_prompt %}{{ ' Assistant:' }}{% endif %}
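
The template enforces strictly alternating user/assistant turns (raising an exception otherwise), prefixes each turn with its capitalized role, flattens list-style content down to its text parts, and appends ` Assistant:` when a generation prompt is requested. A small rendering sketch, again with a placeholder repo id:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("allenai/MolmoAct-example", trust_remote_code=True)  # placeholder id

messages = [
    {"role": "user", "content": [{"type": "text", "text": "What is in this image?"}]},
]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# Under the template above this renders as:
#   "User: What is in this image? Assistant:"
```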
config.json ADDED
@@ -0,0 +1,147 @@
1
+ {
2
+ "adapter_config": {
3
+ "attention_dropout": 0.0,
4
+ "float32_attention": true,
5
+ "head_dim": 64,
6
+ "hidden_act": "silu",
7
+ "hidden_size": 1024,
8
+ "image_feature_dropout": 0.0,
9
+ "image_padding_embed": "pad_and_partial_pad",
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 11008,
12
+ "model_type": "",
13
+ "num_attention_heads": 16,
14
+ "num_key_value_heads": 16,
15
+ "residual_dropout": 0.0,
16
+ "text_hidden_size": 4096,
17
+ "vit_layers": [
18
+ -2,
19
+ -9
20
+ ]
21
+ },
22
+ "architectures": [
23
+ "MolmoActForActionReasoning"
24
+ ],
25
+ "auto_map": {
26
+ "AutoConfig": "configuration_molmoact.MolmoActConfig",
27
+ "AutoModelForImageTextToText": "modeling_molmoact.MolmoActForActionReasoning"
28
+ },
29
+ "image_patch_id": 100866,
30
+ "initializer_range": 0.02,
31
+ "llm_config": {
32
+ "additional_vocab_size": 128,
33
+ "attention_dropout": 0.0,
34
+ "embedding_dropout": 0.0,
35
+ "head_dim": 128,
36
+ "hidden_act": "silu",
37
+ "hidden_size": 4096,
38
+ "initializer_range": 0.02,
39
+ "intermediate_size": 11008,
40
+ "layer_norm_eps": 1e-06,
41
+ "max_position_embeddings": 4096,
42
+ "model_type": "molmoact_llm",
43
+ "norm_after": true,
44
+ "num_attention_heads": 32,
45
+ "num_hidden_layers": 32,
46
+ "num_key_value_heads": 32,
47
+ "qk_norm_type": "olmo",
48
+ "qkv_bias": false,
49
+ "residual_dropout": 0.0,
50
+ "rope_scaling": null,
51
+ "rope_theta": 500000.0,
52
+ "use_cache": true,
53
+ "use_qk_norm": true,
54
+ "vocab_size": 100864
55
+ },
56
+ "model_type": "molmoact",
57
+ "n_action_bins": 256,
58
+ "norm_stats": {
59
+ "molmoact": {
60
+ "action": {
61
+ "max": [
62
+ 0.06042003631591797,
63
+ 0.09417290985584259,
64
+ 0.07019275426864624,
65
+ 0.2616892158985138,
66
+ 0.11751057207584381,
67
+ 0.16968433558940887,
68
+ 1.0
69
+ ],
70
+ "mean": [
71
+ 0.0005706787342205644,
72
+ 0.0002448957529850304,
73
+ -3.5987635783385485e-05,
74
+ 0.00021597897284664214,
75
+ -0.0004896928439848125,
76
+ -0.000241481073317118,
77
+ 0.5570635199546814
78
+ ],
79
+ "min": [
80
+ -0.07434078305959702,
81
+ -0.07339745759963989,
82
+ -0.06539416313171387,
83
+ -0.1688285619020462,
84
+ -0.10289879888296127,
85
+ -0.2667275667190552,
86
+ 0.0
87
+ ],
88
+ "q01": [
89
+ -0.01538565568625927,
90
+ -0.021047022193670273,
91
+ -0.01688069850206375,
92
+ -0.044314172118902206,
93
+ -0.03890235349535942,
94
+ -0.04788423702120781,
95
+ 0.0
96
+ ],
97
+ "q99": [
98
+ 0.014661382883787155,
99
+ 0.026515591889619827,
100
+ 0.021398313343524933,
101
+ 0.04216696694493294,
102
+ 0.03401297703385353,
103
+ 0.04957397282123566,
104
+ 1.0
105
+ ],
106
+ "std": [
107
+ 0.005207270849496126,
108
+ 0.007506529800593853,
109
+ 0.006415561307221651,
110
+ 0.013248044066131115,
111
+ 0.010928540490567684,
112
+ 0.014873150736093521,
113
+ 0.49715080857276917
114
+ ]
115
+ },
116
+ "num_entries": 1560068
117
+ }
118
+ },
119
+ "tie_word_embeddings": false,
120
+ "torch_dtype": "bfloat16",
121
+ "transformers_version": "4.52.3",
122
+ "use_cache": true,
123
+ "vit_config": {
124
+ "attention_dropout": 0.0,
125
+ "float32_attention": true,
126
+ "head_dim": 64,
127
+ "hidden_act": "quick_gelu",
128
+ "hidden_size": 1024,
129
+ "image_default_input_size": [
130
+ 336,
131
+ 336
132
+ ],
133
+ "image_num_pos": 577,
134
+ "image_patch_size": 14,
135
+ "initializer_range": 0.02,
136
+ "intermediate_size": 4096,
137
+ "layer_norm_eps": 1e-05,
138
+ "model_type": "molmoact_vit",
139
+ "num_attention_heads": 16,
140
+ "num_hidden_layers": 23,
141
+ "num_key_value_heads": 16,
142
+ "patch_bias": false,
143
+ "pre_layernorm": true,
144
+ "residual_dropout": 0.0,
145
+ "use_cls_token": true
146
+ }
147
+ }
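
Since `auto_map` routes `AutoConfig` to `configuration_molmoact.MolmoActConfig` (shown next), the nested configuration can be loaded with `trust_remote_code`. A minimal sketch with a placeholder repo id, reading back values recorded above:

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("allenai/MolmoAct-example", trust_remote_code=True)  # placeholder id

print(config.model_type)                  # "molmoact"
print(config.llm_config.hidden_size)      # 4096
print(config.vit_config.image_num_patch)  # (24, 24): a 336x336 input split into 14-pixel patches
print(config.n_action_bins)               # 256
```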
configuration_molmoact.py ADDED
@@ -0,0 +1,355 @@
1
+ """
2
+ MolmoAct configuration
3
+ """
4
+
5
+ from typing import Tuple, Optional, Dict, Any
6
+
7
+ from transformers import PretrainedConfig
8
+ from transformers.modeling_rope_utils import rope_config_validation
9
+ from transformers.utils import logging
10
+
11
+ logger = logging.get_logger(__name__)
12
+
13
+
14
+ class MolmoActVitConfig(PretrainedConfig):
15
+ r"""
16
+ This is the configuration class to store the configuration of a [`MolmoActVisionTransformer`].
17
+ It is used to instantiate a `MolmoActVisionTransformer` according to the specified arguments,
18
+ defining the model architecture.
19
+
20
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
21
+ documentation from [`PretrainedConfig`] for more information.
22
+
23
+ Example:
24
+ ```python
25
+ >>> from transformers import MolmoActVitConfig, MolmoActVisionTransformer
26
+
27
+ >>> # Initializing a MolmoActVitConfig
28
+ >>> configuration = MolmoActVitConfig()
29
+
30
+ >>> # Initializing a MolmoActVisionTransformer (with random weights)
31
+ >>> model = MolmoActVisionTransformer(configuration)
32
+
33
+ >>> # Accessing the model configuration
34
+ >>> configuration = model.config
35
+ ```"""
36
+
37
+ model_type = "molmoact_vit"
38
+
39
+ def __init__(
40
+ self,
41
+ hidden_size: int = 1152,
42
+ intermediate_size: int = 4304,
43
+ num_hidden_layers: int = 27,
44
+ num_attention_heads: int = 16,
45
+ num_key_value_heads: int = 16,
46
+ head_dim: int = 72,
47
+ hidden_act: str = "gelu_pytorch_tanh",
48
+ layer_norm_eps: float = 1e-6,
49
+ image_default_input_size: Tuple[int, int] = (378, 378),
50
+ image_patch_size: int = 14,
51
+ image_num_pos: int = 577,
52
+ attention_dropout: float = 0.0,
53
+ residual_dropout: float = 0.0,
54
+ initializer_range: float = 0.02,
55
+ float32_attention: bool = True,
56
+ use_cls_token: bool = False, # True for OpenCLIP
57
+ patch_bias: bool = True, # False for OpenCLIP
58
+ pre_layernorm: bool = False, # True for OpenCLIP
59
+ **kwargs,
60
+ ):
61
+ super().__init__(**kwargs)
62
+ self.hidden_size = hidden_size
63
+ self.intermediate_size = intermediate_size
64
+ self.num_hidden_layers = num_hidden_layers
65
+ self.num_attention_heads = num_attention_heads
66
+ self.num_key_value_heads = num_key_value_heads
67
+ self.head_dim = head_dim
68
+ self.hidden_act = hidden_act
69
+ self.layer_norm_eps = layer_norm_eps
70
+ self.image_default_input_size = image_default_input_size
71
+ self.image_patch_size = image_patch_size
72
+ self.image_num_pos = image_num_pos
73
+ self.attention_dropout = attention_dropout
74
+ self.residual_dropout = residual_dropout
75
+ self.initializer_range = initializer_range
76
+ self.float32_attention = float32_attention
77
+ self.use_cls_token = use_cls_token
78
+ self.patch_bias = patch_bias
79
+ self.pre_layernorm = pre_layernorm
80
+
81
+ @property
82
+ def image_num_patch(self):
83
+ h, w = self.image_default_input_size
84
+ return h // self.image_patch_size, w // self.image_patch_size
85
+
86
+
87
+ class MolmoActAdapterConfig(PretrainedConfig):
88
+ r"""
89
+ This is the configuration class to store the configuration of a MolmoActAdapter. Together with MolmoActVitConfig,
90
+ it is used to instantiate a MolmoActVisionBackbone according to the specified arguments,
91
+ defining the model architecture.
92
+
93
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
94
+ documentation from [`PretrainedConfig`] for more information.
95
+
96
+ Example:
97
+
98
+ ```python
99
+ >>> from transformers import MolmoActVitConfig, MolmoActAdapterConfig, MolmoActVisionBackbone
100
+
101
+ >>> # Initializing a MolmoActVitConfig and a MolmoActAdapterConfig
102
+ >>> vit_config = MolmoActVitConfig()
103
+ >>> adapter_config = MolmoActAdapterConfig()
104
+
105
+ >>> # Initializing a MolmoActVisionBackbone (with random weights)
106
+ >>> model = MolmoActVisionBackbone(vit_config, adapter_config)
107
+
108
+ >>> # Accessing the model configuration
109
+ >>> vit_configuration = model.vit_config
110
+ >>> adapter_configuration = model.adapter_config
111
+ ```"""
112
+
113
+ def __init__(
114
+ self,
115
+ vit_layers: Tuple = (-3, -9),
116
+ hidden_size: int = 1152,
117
+ num_attention_heads: int = 16,
118
+ num_key_value_heads: int = 16,
119
+ head_dim: int = 72,
120
+ float32_attention: bool = True,
121
+ attention_dropout: float = 0.0,
122
+ residual_dropout: float = 0.0,
123
+ hidden_act: str = "silu",
124
+ intermediate_size: int = 18944,
125
+ text_hidden_size: int = 3584,
126
+ image_feature_dropout: float = 0.0,
127
+ initializer_range: float = 0.02,
128
+ # pooling_mode: str = "indices", # "indices" (SigLIP) or "2x2_attention" (OpenCLIP)
129
+ image_padding_embed: Optional[str] = None, # e.g. "pad_and_partial_pad"
130
+ **kwargs,
131
+ ):
132
+ super().__init__(**kwargs)
133
+ self.vit_layers = vit_layers
134
+ self.hidden_size = hidden_size
135
+ self.num_attention_heads = num_attention_heads
136
+ self.num_key_value_heads = num_key_value_heads
137
+ self.head_dim = head_dim
138
+ self.float32_attention = float32_attention
139
+ self.attention_dropout = attention_dropout
140
+ self.residual_dropout = residual_dropout
141
+ self.hidden_act = hidden_act
142
+ self.intermediate_size = intermediate_size
143
+ self.text_hidden_size = text_hidden_size
144
+ self.image_feature_dropout = image_feature_dropout
145
+ self.initializer_range = initializer_range
146
+ # self.pooling_mode = pooling_mode
147
+ self.image_padding_embed = image_padding_embed
148
+
149
+
150
+ class MolmoActLlmConfig(PretrainedConfig):
151
+ r"""
152
+ This is the configuration class to store the configuration of a [`MolmoActLlm`]. It is used to instantiate a
153
+ `MolmoActLlm` according to the specified arguments, defining the model architecture.
154
+
155
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
156
+ documentation from [`PretrainedConfig`] for more information.
157
+
158
+ Example:
159
+ ```python
160
+ >>> from transformers import MolmoActLlmConfig, MolmoActLlm
161
+
162
+ >>> # Initializing a MolmoActLlmConfig
163
+ >>> configuration = MolmoActLlmConfig()
164
+
165
+ >>> # Initializing a MolmoActLlm (with random weights)
166
+ >>> model = MolmoActLlm(configuration)
167
+
168
+ >>> # Accessing the model configuration
169
+ >>> configuration = model.config
170
+ ```"""
171
+
172
+ model_type = "molmoact_llm"
173
+ keys_to_ignore_at_inference = ["past_key_values"]
174
+ base_model_tp_plan = {
175
+ "blocks.*.self_attn.att_proj": "colwise",
176
+ "blocks.*.self_attn.attn_out": "rowwise",
177
+ "blocks.*.mlp.ff_proj": "colwise",
178
+ "blocks.*.mlp.ff_out": "rowwise",
179
+ }
180
+ base_model_pp_plan = {
181
+ "wte": (["input_ids"], ["inputs_embeds"]),
182
+ "blocks": (["hidden_states", "attention_mask"], ["hidden_states"]),
183
+ "ln_f": (["hidden_states"], ["hidden_states"]),
184
+ }
185
+
186
+ def __init__(
187
+ self,
188
+ hidden_size: int = 3584,
189
+ num_attention_heads: int = 28,
190
+ num_key_value_heads: Optional[int] = 4,
191
+ head_dim: int = 128,
192
+ vocab_size: int = 152064,
193
+ additional_vocab_size: int = 128,
194
+ qkv_bias: bool = True,
195
+ num_hidden_layers: int = 48,
196
+ intermediate_size: int = 18944,
197
+ hidden_act: str = "silu",
198
+ embedding_dropout: float=0.0,
199
+ attention_dropout: float=0.0,
200
+ residual_dropout: float = 0.0,
201
+ max_position_embeddings: int = 4096,
202
+ rope_theta: float = 1000000.0,
203
+ rope_scaling: Dict[str, Any] = None,
204
+ use_qk_norm: bool = False,
205
+ qk_norm_type: str = "olmo",
206
+ layer_norm_eps: float = 1e-6,
207
+ norm_after: bool = False,
208
+ initializer_range: float = 0.02,
209
+ use_cache=True,
210
+ tie_word_embeddings=False,
211
+ **kwargs,
212
+ ):
213
+ super().__init__(
214
+ tie_word_embeddings=tie_word_embeddings,
215
+ **kwargs
216
+ )
217
+ self.hidden_size = hidden_size
218
+ self.num_attention_heads = num_attention_heads
219
+ if num_key_value_heads is None:
220
+ num_key_value_heads = num_attention_heads
221
+ self.num_key_value_heads = num_key_value_heads
222
+ self.head_dim = head_dim
223
+ self.vocab_size = vocab_size
224
+ self.additional_vocab_size = additional_vocab_size
225
+ self.qkv_bias = qkv_bias
226
+ self.num_hidden_layers = num_hidden_layers
227
+ self.intermediate_size = intermediate_size
228
+ self.hidden_act = hidden_act
229
+ self.embedding_dropout = embedding_dropout
230
+ self.attention_dropout = attention_dropout
231
+ self.residual_dropout = residual_dropout
232
+ self.max_position_embeddings = max_position_embeddings
233
+ self.rope_theta = rope_theta
234
+ self.rope_scaling = rope_scaling
235
+ self.use_qk_norm = use_qk_norm
236
+ self.qk_norm_type = qk_norm_type
237
+ self.layer_norm_eps = layer_norm_eps
238
+ self.norm_after = norm_after
239
+ self.initializer_range = initializer_range
240
+ self.use_cache = use_cache
241
+
242
+ # Validate the correctness of rotary position embeddings parameters
243
+ rope_config_validation(self)
244
+
245
+
246
+ class MolmoActConfig(PretrainedConfig):
247
+ r"""
248
+ This is the configuration class to store the configuration of a [`MolmoActForActionReasoning`].
249
+ It is used to instantiate a MolmoAct model according to the specified arguments, defining the model architecture.
250
+
251
+ Example:
252
+
253
+ ```python
254
+ >>> from transformers import MolmoActConfig, MolmoActVitConfig, MolmoActAdapterConfig, MolmoActLlmConfig
255
+
256
+ >>> # Initializing a MolmoActVitConfig
257
+ >>> vit_config = MolmoActVitConfig()
258
+
259
+ >>> # Initializing a MolmoActAdapterConfig
260
+ >>> adapter_config = MolmoActAdapterConfig()
261
+
262
+ >>> # Initializing a MolmoActLlmConfig
263
+ >>> llm_config = MolmoActLlmConfig()
264
+
265
+ >>> # Initializing a MolmoActConfig
266
+ >>> configuration = MolmoActConfig(vit_config, adapter_config, llm_config, image_patch_id=152069)
267
+
268
+ >>> # Initializing a model
269
+ >>> model = MolmoActForActionReasoning(configuration)
270
+
271
+ >>> # Accessing the model configuration
272
+ >>> configuration = model.config
273
+ ```"""
274
+
275
+ model_type = "molmoact"
276
+ sub_configs = {
277
+ "llm_config": MolmoActLlmConfig,
278
+ "vit_config": MolmoActVitConfig,
279
+ "adapter_config": MolmoActAdapterConfig,
280
+ }
281
+
282
+ def __init__(
283
+ self,
284
+ vit_config: MolmoActVitConfig = None,
285
+ adapter_config: MolmoActAdapterConfig = None,
286
+ llm_config: MolmoActLlmConfig = None,
287
+ image_patch_id: int = None,
288
+ initializer_range: float = 0.02,
289
+ n_action_bins: int = 256,
290
+ norm_stats: dict = {},
291
+ **kwargs,
292
+ ):
293
+ super().__init__(**kwargs)
294
+ if vit_config is None:
295
+ self.vit_config = MolmoActVitConfig()
296
+ elif isinstance(vit_config, dict):
297
+ self.vit_config = MolmoActVitConfig(**vit_config)
298
+ else:
299
+ self.vit_config = vit_config
300
+ if adapter_config is None:
301
+ self.adapter_config = MolmoActAdapterConfig()
302
+ elif isinstance(adapter_config, dict):
303
+ self.adapter_config = MolmoActAdapterConfig(**adapter_config)
304
+ else:
305
+ self.adapter_config = adapter_config
306
+ if llm_config is None:
307
+ self.llm_config = MolmoActLlmConfig()
308
+ elif isinstance(llm_config, dict):
309
+ self.llm_config = MolmoActLlmConfig(**llm_config)
310
+ else:
311
+ self.llm_config = llm_config
312
+ self.image_patch_id = image_patch_id
313
+ self.initializer_range = initializer_range
314
+
315
+ self.n_action_bins = n_action_bins
316
+ self.norm_stats = norm_stats
317
+
318
+ @property
319
+ def image_num_patch(self):
320
+ assert self.vit_config is not None
321
+ return self.vit_config.image_num_patch
322
+
323
+ @property
324
+ def num_attention_heads(self):
325
+ return self.llm_config.num_attention_heads
326
+
327
+ @property
328
+ def num_key_value_heads(self):
329
+ return self.llm_config.num_key_value_heads
330
+
331
+ @property
332
+ def head_dim(self):
333
+ return self.llm_config.head_dim
334
+
335
+ @property
336
+ def num_hidden_layers(self):
337
+ return self.llm_config.num_hidden_layers
338
+
339
+ @property
340
+ def hidden_size(self):
341
+ return self.llm_config.hidden_size
342
+
343
+ @property
344
+ def vocab_size(self):
345
+ return self.llm_config.vocab_size
346
+
347
+ @property
348
+ def max_position_embeddings(self):
349
+ return self.llm_config.max_position_embeddings
350
+
351
+
352
+ MolmoActVitConfig.register_for_auto_class()
353
+ MolmoActAdapterConfig.register_for_auto_class()
354
+ MolmoActLlmConfig.register_for_auto_class()
355
+ MolmoActConfig.register_for_auto_class()
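
`MolmoActConfig` accepts its sub-configs either as typed config objects or as plain dicts (which is how `config.json` is parsed), and proxies common attributes such as `hidden_size` and `vocab_size` through to `llm_config`. A small composition sketch, assuming `configuration_molmoact.py` is importable from the working directory and using values taken from `config.json` above:

```python
from configuration_molmoact import MolmoActConfig

# Sub-configs passed as dicts are converted to the typed config classes in __init__.
config = MolmoActConfig(
    vit_config={"hidden_size": 1024, "num_hidden_layers": 23, "image_default_input_size": (336, 336)},
    adapter_config={"text_hidden_size": 4096, "vit_layers": [-2, -9]},
    llm_config={"hidden_size": 4096, "num_hidden_layers": 32, "vocab_size": 100864},
    image_patch_id=100866,
)

assert config.hidden_size == 4096          # proxied to llm_config via the property
assert config.vocab_size == 100864
assert config.image_num_patch == (24, 24)  # from vit_config: 336 // 14 per side
```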
generation_config.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "bos_token_id": 100257,
3
+ "eos_token_id": 100257,
4
+ "pad_token_id": 100277,
5
+ "transformers_version": "4.52.3"
6
+ }
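
These generation defaults (shared BOS/EOS id 100257, pad id 100277) can be read back through the stock `GenerationConfig` API; a trivial sketch with a placeholder repo id:

```python
from transformers import GenerationConfig

gen = GenerationConfig.from_pretrained("allenai/MolmoAct-example")  # placeholder id
assert gen.bos_token_id == gen.eos_token_id == 100257
assert gen.pad_token_id == 100277
```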
image_processing_molmoact.py ADDED
@@ -0,0 +1,959 @@
1
+ """Image processor class for MolmoAct"""
2
+ from typing import TYPE_CHECKING, Tuple, List, Optional, Union, Dict, Any
3
+ import numpy as np
4
+ import einops
5
+ import torch
6
+ import torchvision.transforms
7
+ from torchvision.transforms import InterpolationMode
8
+ from torchvision.transforms.functional import convert_image_dtype
9
+
10
+ from transformers.image_utils import (
11
+ OPENAI_CLIP_MEAN,
12
+ OPENAI_CLIP_STD,
13
+ ChannelDimension,
14
+ ImageInput,
15
+ is_valid_image,
16
+ valid_images,
17
+ to_numpy_array,
18
+ )
19
+ from transformers.image_transforms import convert_to_rgb, to_channel_dimension_format
20
+ from transformers.processing_utils import ImagesKwargs
21
+ from transformers.image_processing_utils import BaseImageProcessor
22
+ from transformers.utils import logging
23
+ from transformers.feature_extraction_utils import BatchFeature
24
+ from transformers.utils import TensorType, logging
25
+
26
+
27
+ if TYPE_CHECKING:
28
+ from transformers.utils import TensorType, logging
29
+
30
+
31
+ logger = logging.get_logger(__name__)
32
+
33
+
34
+ def is_multi_image(image: Union[ImageInput, List[ImageInput]]) -> bool:
35
+ return isinstance(image, (list, tuple))
36
+
37
+
38
+ def make_batched_images(images) -> List[ImageInput]:
39
+ """
40
+ Accepts images in list or nested list format.
41
+
42
+ Args:
43
+ images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
44
+ The input image.
45
+
46
+ Returns:
47
+ list: A list of images or a list of lists of images.
48
+ """
49
+ if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
50
+ return images
51
+
52
+ elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
53
+ return images
54
+
55
+ elif is_valid_image(images):
56
+ return [images]
57
+
58
+ raise ValueError(f"Could not make batched images from {images}")
59
+
60
+
61
+ def normalize_image(image: np.ndarray, normalize_mode: str) -> np.ndarray:
62
+ if normalize_mode == "openai":
63
+ image -= np.array(OPENAI_CLIP_MEAN, dtype=np.float32)[None, None, :]
64
+ image /= np.array(OPENAI_CLIP_STD, dtype=np.float32)[None, None, :]
65
+ elif normalize_mode == "siglip":
66
+ image = np.asarray(-1.0, dtype=np.float32) + image * np.asarray(2.0, dtype=np.float32)
67
+ elif normalize_mode == "dino":
68
+ image -= np.array([0.485, 0.456, 0.406], dtype=np.float32)[None, None, :]
69
+ image /= np.array([0.229, 0.224, 0.225], dtype=np.float32)[None, None, :]
70
+ else:
71
+ raise NotImplementedError(normalize_mode)
72
+ return image
73
+
74
+
75
+ # Helper to ensure output_size is a 2-tuple of built-in Python ints
76
+ def _ensure_pyint_size2(size):
77
+ """
78
+ Ensure `size` is a 2-tuple of built-in Python ints.
79
+ Accepts int, list/tuple, or numpy array of length 1 or 2.
80
+ """
81
+ import numpy as np
82
+ # If it's an array-like, normalize to length-2 tuple
83
+ if isinstance(size, (list, tuple, np.ndarray)):
84
+ if len(size) == 2:
85
+ return (int(size[0]), int(size[1]))
86
+ elif len(size) == 1:
87
+ s = int(size[0])
88
+ return (s, s)
89
+ else:
90
+ # Fallback: try to interpret as square size using first element
91
+ s = int(size[0])
92
+ return (s, s)
93
+ # Scalar → square size
94
+ s = int(size)
95
+ return (s, s)
96
+
97
+
98
+ def resize_and_pad(
99
+ image,
100
+ desired_output_size,
101
+ resize_method="torch-bilinear",
102
+ pad_value=0,
103
+ ):
104
+ """Resize an image while padding to preserve uts aspect ratio."""
105
+ desired_output_size = _ensure_pyint_size2(desired_output_size)
106
+ desired_height, desired_width = desired_output_size
107
+ height, width = image.shape[:2]
108
+
109
+ # Cast into float32 since the training code did this in float32 and it (very rarely) affects
110
+ # the results after rounding.
111
+ image_scale_y = np.array(desired_height, np.float32) / np.array(height, np.float32)
112
+ image_scale_x = np.array(desired_width, np.float32) / np.array(width, np.float32)
113
+ image_scale = min(image_scale_x, image_scale_y)
114
+ scaled_height = int(np.array(height, np.float32) * image_scale)
115
+ scaled_width = int(np.array(width, np.float32) * image_scale)
116
+
117
+ if resize_method in ["torch-bilinear"]:
118
+ image = torch.permute(torch.from_numpy(image), [2, 0, 1])
119
+ image = convert_image_dtype(image) # resize in float32 to match the training code
120
+ mode = InterpolationMode.BILINEAR
121
+ image = torchvision.transforms.Resize([scaled_height, scaled_width], mode, antialias=True)(image)
122
+ image = torch.clip(image, 0.0, 1.0)
123
+ image = torch.permute(image, [1, 2, 0]).numpy()
124
+ else:
125
+ raise NotImplementedError(resize_method)
126
+
127
+ top_pad = (desired_height - scaled_height) // 2
128
+ left_pad = (desired_width - scaled_width) // 2
129
+ padding = [
130
+ [top_pad, desired_height - scaled_height - top_pad],
131
+ [left_pad, desired_width - scaled_width - left_pad],
132
+ [0, 0]
133
+ ]
134
+ image_mask = np.pad(np.ones_like(image[:, :, 0], dtype=bool), padding[:2])
135
+ image = np.pad(image, padding, constant_values=pad_value)
136
+ return image, image_mask
137
+
138
+
139
+ def metaclip_resize(image, desired_output_size):
140
+ desired_output_size = _ensure_pyint_size2(desired_output_size)
141
+ image = torch.permute(torch.from_numpy(image), [2, 0, 1])
142
+ if torch.is_floating_point(image):
143
+ image = torchvision.transforms.Resize(
144
+ desired_output_size, InterpolationMode.BICUBIC, antialias=True)(image)
145
+ image = torch.clip(image, 0.0, 1.0)
146
+ else:
147
+ assert image.dtype == torch.uint8, "Expected float images or uint8 images, but got {}".format(image.dtype)
148
+ image = torchvision.transforms.Resize(
149
+ desired_output_size, InterpolationMode.BICUBIC, antialias=True)(image)
150
+ image = image.to(torch.float32)
151
+ image = torch.clip(image, 0, 255)
152
+ image = image / 255.0
153
+ resized = torch.permute(image, [1, 2, 0]).numpy()
154
+ image_mask = np.ones_like(resized[:, :, 0], dtype=np.bool_)
155
+ return resized, image_mask
156
+
157
+
158
+ def siglip_resize_and_pad(
159
+ image: np.ndarray,
160
+ desired_output_size: Tuple[int, int],
161
+ ) -> Tuple[np.ndarray, np.ndarray]:
162
+ desired_output_size = _ensure_pyint_size2(desired_output_size)
163
+ if len(image.shape) == 3:
164
+ is_video = False
165
+ image = torch.permute(torch.from_numpy(image), [2, 0, 1])
166
+ else:
167
+ is_video = True
168
+ image = torch.permute(torch.from_numpy(image), [0, 3, 1, 2])
169
+ dtype = image.dtype
170
+ if torch.is_floating_point(image):
171
+ in_min = 0.0
172
+ in_max = 1.0
173
+ resized = torchvision.transforms.Resize(
174
+ desired_output_size,
175
+ InterpolationMode.BILINEAR,
176
+ antialias=False,
177
+ )(image)
178
+ resized = torch.clip(resized, 0.0, 1.0).to(dtype)
179
+ else:
180
+ assert image.dtype == torch.uint8, "SigLIP expects float images or uint8 images, but got {}".format(image.dtype)
181
+ in_min = 0.0
182
+ in_max = 255.0
183
+ resized = torchvision.transforms.Resize(
184
+ desired_output_size,
185
+ InterpolationMode.BILINEAR,
186
+ antialias=False,
187
+ )(image)
188
+ resized = torch.clip(resized, 0, 255).to(dtype)
189
+
190
+ resized = resized.to(torch.float32)
191
+ resized = (resized - in_min) / (in_max - in_min)
192
+
193
+ if is_video:
194
+ resized = torch.permute(resized, [0, 2, 3, 1]).numpy()
195
+ image_mask = None
196
+ else:
197
+ resized = torch.permute(resized, [1, 2, 0]).numpy()
198
+ image_mask = np.ones_like(resized[:, :, 0], dtype=np.bool_)
199
+
200
+ return resized, image_mask
201
+
202
+
203
+ def dino_resize_and_pad(
204
+ image: np.ndarray,
205
+ desired_output_size: Tuple[int, int],
206
+ ) -> Tuple[np.ndarray, np.ndarray]:
207
+ desired_output_size = _ensure_pyint_size2(desired_output_size)
208
+ image = torch.permute(torch.from_numpy(image), [2, 0, 1])
209
+ dtype = image.dtype
210
+ if torch.is_floating_point(image):
211
+ resized = torchvision.transforms.Resize(
212
+ desired_output_size,
213
+ InterpolationMode.BICUBIC,
214
+ antialias=True,
215
+ )(image)
216
+ resized = torch.clip(resized, 0.0, 1.0).to(torch.float32)
217
+ else:
218
+ assert image.dtype == torch.uint8, "DINOv2 expects float images or uint8 images, but got {}".format(image.dtype)
219
+ resized = torchvision.transforms.Resize(
220
+ desired_output_size,
221
+ InterpolationMode.BICUBIC,
222
+ antialias=True,
223
+ )(image)
224
+ resized = torch.clip(resized, 0, 255).to(torch.float32)
225
+ resized = resized / 255.0
226
+
227
+ resized = torch.permute(resized, [1, 2, 0]).numpy()
228
+ image_mask = np.ones_like(resized[:, :, 0], dtype=np.bool_)
229
+
230
+ return resized, image_mask
231
+
232
+
233
+ def resize_image(
234
+ image: np.ndarray,
235
+ resize_mode: str,
236
+ output_size: Tuple[int, int],
237
+ pad_value: float,
238
+ ) -> Tuple[np.ndarray, np.ndarray]:
239
+ if resize_mode == "siglip":
240
+ return siglip_resize_and_pad(image, output_size)
241
+ elif resize_mode == "dino":
242
+ return dino_resize_and_pad(image, output_size)
243
+ elif resize_mode == "metaclip":
244
+ return metaclip_resize(image, output_size)
245
+ else:
246
+ resize = "torch-bilinear" if resize_mode == "default" else resize_mode
247
+ return resize_and_pad(
248
+ image, output_size, resize_method=resize, pad_value=pad_value,
249
+ )
250
+
251
+
252
+ def select_tiling(h, w, patch_size, max_num_crops):
253
+ """Divide in image of size [w, h] in up to max_num_patches of size patch_size"""
254
+ original_size = np.stack([h, w]) # [1, 2]
255
+ original_res = h * w
256
+ tilings = []
257
+ for i in range(1, max_num_crops + 1):
258
+ for j in range(1, max_num_crops + 1):
259
+ if i*j <= max_num_crops:
260
+ tilings.append((i, j))
261
+ # sort so argmin and argmax favour smaller tilings in the event of a tie
262
+ tilings.sort(key=lambda x: (x[0]*x[1], x[0]))
263
+ candidate_tilings = np.array(tilings, dtype=np.int32) # [n_resolutions, 2]
264
+ candidate_resolutions = candidate_tilings * patch_size # [n_resolutions, 2]
265
+
266
+ # How much we would need to scale the image to fit exactly in each tiling
267
+ original_size = np.stack([h, w], dtype=np.float32) # [1, 2]
268
+
269
+ # The original size can be zero in rare cases if the image is smaller than the margin
270
+ # In those cases letting the scale become infinite means the tiling is based on the
271
+ # other side, or falls back to the smallest tiling
272
+ with np.errstate(divide='ignore'):
273
+ required_scale_d = candidate_resolutions.astype(np.float32) / original_size # [n_resolutions, 2]
274
+ required_scale = np.min(required_scale_d, axis=-1, keepdims=True) # [n_resolutions, 1]
275
+ if np.all(required_scale < 1):
276
+ # We are forced to downscale, so try to minimize the amount of downscaling
277
+ ix = np.argmax(required_scale)
278
+ else:
279
+ # Pick the resolution that required the least upscaling so that it most closely fits the image
280
+ required_scale = np.where(required_scale < 1.0, 10e9, required_scale)
281
+ ix = np.argmin(required_scale)
282
+ return candidate_tilings[ix]
283
+
284
+
285
+ def build_resized_image(
286
+ image: np.ndarray,
287
+ resize_mode: str,
288
+ normalized_mode: str,
289
+ base_image_input_size: List[int],
290
+ pad_value: float,
291
+ image_patch_size: int,
292
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
293
+ resized, resized_mask = resize_image(
294
+ image, resize_mode, base_image_input_size, pad_value,
295
+ )
296
+ resized = normalize_image(resized, normalized_mode)
297
+ if len(resized.shape) == 3:
298
+ resized = np.expand_dims(resized, 0)
299
+ resized_mask = np.expand_dims(resized_mask, 0)
300
+ crop_patch_w = base_image_input_size[1] // image_patch_size
301
+ crop_patch_h = base_image_input_size[0] // image_patch_size
302
+ resize_idx = np.arange(crop_patch_w*crop_patch_h).reshape([crop_patch_h, crop_patch_w])
303
+ return resized, resized_mask, resize_idx
304
+
305
+
306
+ def build_overlapping_crops(
307
+ image: np.ndarray,
308
+ resize_mode: str,
309
+ normalize_mode: str,
310
+ max_crops: int,
311
+ overlap_margins: List[int],
312
+ base_image_input_size: List[int],
313
+ pad_value: float,
314
+ image_patch_size: int,
315
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
316
+ """Decompose an image into a set of overlapping crops
317
+
318
+ :return crop_arr: [n_crops, h, w, 3] The crops
319
+ :return mask_arr: [n_crops, h, w] The padding masks
320
+ :return patch_idx: [overlap_patch_h, overlap_patch_w] For each patch of the resized image
321
+ the crops were extracted from, the index of the corresponding patch in `crop_arr`
322
+ """
323
+ original_image_h, original_image_w = image.shape[:2]
324
+ crop_size = base_image_input_size[0]
325
+ assert base_image_input_size[0] == base_image_input_size[1]
326
+
327
+ left_margin, right_margin = overlap_margins
328
+ total_margin_pixels = image_patch_size * (right_margin + left_margin) # pixels removed per dim
329
+ crop_patches = base_image_input_size[0] // image_patch_size # patches per crop dim
330
+ crop_window_patches = crop_patches - (right_margin + left_margin) # usable patches
331
+ crop_window_size = crop_window_patches * image_patch_size
332
+ crop_patch_w = base_image_input_size[1] // image_patch_size
333
+ crop_patch_h = base_image_input_size[0] // image_patch_size
334
+ original_image_h, original_image_w = image.shape[:2]
335
+ crop_size = base_image_input_size[0]
336
+
337
+ # Decide how to tile the image, to account for the overlap margins we compute the tiling
338
+ # as if we had an image without the margins and were using a crop size without the margins
339
+ tiling = select_tiling(
340
+ original_image_h - total_margin_pixels,
341
+ original_image_w - total_margin_pixels,
342
+ crop_window_size,
343
+ max_crops,
344
+ )
345
+
346
+ src, img_mask = resize_image(
347
+ image,
348
+ resize_mode,
349
+ [tiling[0]*crop_window_size+total_margin_pixels, tiling[1]*crop_window_size+total_margin_pixels],
350
+ pad_value,
351
+ )
352
+ src = normalize_image(src, normalize_mode)
353
+
354
+ # Now we have to split the image into crops, and track what patches came from
355
+ # where in `patch_idx_arr`
356
+ n_crops = tiling[0] * tiling[1]
357
+ crop_arr = np.zeros([n_crops, crop_size, crop_size, 3], dtype=src.dtype)
358
+ mask_arr = np.zeros([n_crops, crop_size, crop_size], dtype=img_mask.dtype)
359
+ patch_idx_arr = np.zeros([n_crops, crop_patch_h, crop_patch_w], dtype=np.int32)
360
+ on = 0
361
+ on_crop = 0
362
+ for i in range(tiling[0]):
363
+ # Slide over `src` by `crop_window_size` steps, but extract crops of size `crop_size`
364
+ # which results in overlapping crop windows
365
+ y0 = i*crop_window_size
366
+ for j in range(tiling[1]):
367
+ x0 = j*crop_window_size
368
+ crop_arr[on_crop] = src[y0:y0+crop_size, x0:x0+crop_size]
369
+ mask_arr[on_crop] = img_mask[y0:y0+crop_size, x0:x0+crop_size]
370
+ patch_idx = np.arange(crop_patch_w*crop_patch_h).reshape(crop_patch_h, crop_patch_w)
371
+ patch_idx += on_crop * crop_patch_h * crop_patch_w
372
+
373
+ # Mask out idx that are in the overlap region
374
+ if i != 0:
375
+ patch_idx[:left_margin, :] = -1
376
+ if j != 0:
377
+ patch_idx[:, :left_margin] = -1
378
+ if i != tiling[0]-1:
379
+ patch_idx[-right_margin:, :] = -1
380
+ if j != tiling[1]-1:
381
+ patch_idx[:, -right_margin:] = -1
382
+ patch_idx_arr[on_crop] = patch_idx
383
+ on_crop += 1
384
+
385
+ # `patch_idx_arr` is ordered crop-by-crop, here we transpose `patch_idx_arr`
386
+ # so it is in left-to-right order
387
+ patch_idx_arr = np.reshape(
388
+ patch_idx_arr,
389
+ [tiling[0], tiling[1], crop_patch_h, crop_patch_w]
390
+ )
391
+ patch_idx_arr = np.transpose(patch_idx_arr, [0, 2, 1, 3])
392
+ patch_idx_arr = np.reshape(patch_idx_arr, [-1])
393
+
394
+ # Now keep only the parts not in the overlap region, so the result maps each patch in `src`
395
+ # to the patch in `crop_arr` it should come from
396
+ patch_idx_arr = patch_idx_arr[patch_idx_arr >= 0].reshape(
397
+ src.shape[0]//image_patch_size,
398
+ src.shape[1]//image_patch_size,
399
+ )
400
+ return crop_arr, mask_arr, patch_idx_arr
401
+
402
+
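The crop geometry above is easiest to follow with concrete numbers. A short sketch using the processor defaults (`image_patch_size=14`, `overlap_margins=[4, 4]`, `base_image_input_size=[378, 378]`); the tiling value is hypothetical:

image_patch_size = 14
left_margin, right_margin = 4, 4
base = 378

total_margin_pixels = image_patch_size * (right_margin + left_margin)  # 112
crop_patches = base // image_patch_size                                # 27
crop_window_patches = crop_patches - (right_margin + left_margin)      # 19
crop_window_size = crop_window_patches * image_patch_size              # 266
assert (total_margin_pixels, crop_window_size) == (112, 266)

# If select_tiling returned, say, [2, 3], the image would be resized to
# [2*266 + 112, 3*266 + 112] = [644, 910] and cut into 6 overlapping
# 378x378 crops.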
403
+ def batch_pixels_to_patches(array: np.ndarray, patch_size: int) -> np.ndarray:
404
+ """Reshape images of [n_images, h, w, 3] -> [n_images, n_patches, pixels_per_patch]"""
405
+ if len(array.shape) == 3:
406
+ n_crops, h, w = array.shape
407
+ h_patches = h//patch_size
408
+ w_patches = w//patch_size
409
+ array = np.reshape(array, [n_crops, h_patches, patch_size, w_patches, patch_size])
410
+ array = np.transpose(array, [0, 1, 3, 2, 4])
411
+ array = np.reshape(array, [n_crops, h_patches*w_patches, patch_size*patch_size])
412
+ return array
413
+ else:
414
+ n_crops, h, w, c = array.shape
415
+ h_patches = h//patch_size
416
+ w_patches = w//patch_size
417
+ array = np.reshape(array, [n_crops, h_patches, patch_size, w_patches, patch_size, c])
418
+ array = np.transpose(array, [0, 1, 3, 2, 4, 5])
419
+ array = np.reshape(array, [n_crops, h_patches*w_patches, patch_size*patch_size*c])
420
+ return array
421
+
422
+
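A small sketch of `batch_pixels_to_patches` on toy inputs: one 4x4 RGB image with 2x2 patches becomes 4 patches of 2*2*3 = 12 pixel values, and a mask without a channel dimension follows the 3-dim branch:

import numpy as np

toy = np.arange(1 * 4 * 4 * 3).reshape([1, 4, 4, 3])
assert batch_pixels_to_patches(toy, patch_size=2).shape == (1, 4, 12)

toy_mask = np.ones([1, 4, 4])
assert batch_pixels_to_patches(toy_mask, patch_size=2).shape == (1, 4, 4)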
423
+ def arange_for_pooling(
424
+ idx_arr: np.ndarray,
425
+ pool_h: int,
426
+ pool_w: int,
427
+ ) -> np.ndarray:
428
+ h_pad = pool_h * ((idx_arr.shape[0] + pool_h - 1) // pool_h) - idx_arr.shape[0]
429
+ w_pad = pool_w * ((idx_arr.shape[1] + pool_w - 1) // pool_w) - idx_arr.shape[1]
430
+ idx_arr = np.pad(idx_arr, [[h_pad//2, (h_pad+1)//2], [w_pad//2, (w_pad+1)//2]],
431
+ mode='constant', constant_values=-1)
432
+ return einops.rearrange(
433
+ idx_arr, "(h dh) (w dw) -> h w (dh dw)", dh=pool_h, dw=pool_w)
434
+
435
+
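A minimal sketch of `arange_for_pooling`: a 27x27 patch-index grid (the default global-image grid) with 2x2 pooling is padded to 28x28 with -1 and regrouped into a 14x14 grid of 4 source indices per pooled token:

import numpy as np

idx = np.arange(27 * 27).reshape(27, 27)
pooled = arange_for_pooling(idx, pool_h=2, pool_w=2)
assert pooled.shape == (14, 14, 4)
# The padded positions appear as -1 and mark patches that should be ignored.
assert (pooled == -1).sum() == 28 * 28 - 27 * 27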
436
+ def image_to_patches_and_grids(
437
+ image: ImageInput,
438
+ crop_mode: str,
439
+ resize_mode: str,
440
+ normalize_mode: str,
441
+ max_crops: int,
442
+ overlap_margins: List[int],
443
+ base_image_input_size: List[int],
444
+ pad_value: float,
445
+ image_patch_size: int,
446
+ image_pooling_w: int,
447
+ image_pooling_h: int,
448
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
449
+ """
450
+ :return image_grids, the shape of each (low-res, high-res) image after pooling
451
+ :return crops, the image crops to process with the ViT
453
+ :return mask, the padding mask for each crop
454
+ :return pooled_patch_idx, for each pooled output patch, the indices of the
455
+ patches in `crops` to pool for that patch, masked with -1
455
+ """
456
+ if isinstance(base_image_input_size, int):
457
+ base_image_input_size = (base_image_input_size, base_image_input_size)
458
+
459
+ base_image_input_d = image_patch_size
460
+ pooling_w = image_pooling_w
461
+ pooling_h = image_pooling_h
462
+ crop_patch_w = base_image_input_size[1] // base_image_input_d
463
+ crop_patch_h = base_image_input_size[0] // base_image_input_d
464
+
465
+ if crop_mode == "resize":
466
+ resized, resized_mask, resize_idx = build_resized_image(
467
+ image,
468
+ resize_mode,
469
+ normalize_mode,
470
+ base_image_input_size,
471
+ pad_value,
472
+ image_patch_size
473
+ )
474
+ pooling_idx = arange_for_pooling(resize_idx, pooling_h, pooling_w)
475
+ h, w = pooling_idx.shape[:2]
476
+ pooling_idx = pooling_idx.reshape([-1, pooling_h*pooling_w])
477
+ image_grid = [np.array([h, w])]
478
+ return (
479
+ np.stack(image_grid, 0),
480
+ batch_pixels_to_patches(resized, image_patch_size),
481
+ batch_pixels_to_patches(resized_mask, image_patch_size).mean(-1),
482
+ pooling_idx,
483
+ )
484
+
485
+ if crop_mode in ["overlap-and-resize-c2", "overlap-and-resize"]:
486
+ crop_arr, mask_arr, patch_idx_arr = build_overlapping_crops(
487
+ image,
488
+ resize_mode,
489
+ normalize_mode,
490
+ max_crops,
491
+ overlap_margins,
492
+ base_image_input_size,
493
+ pad_value,
494
+ image_patch_size,
495
+ )
496
+ pooling_idx = arange_for_pooling(patch_idx_arr, pooling_h, pooling_w)
497
+ h, w = pooling_idx.shape[:2]
498
+ pooling_idx = pooling_idx.reshape([-1, pooling_h*pooling_w])
499
+ image_grid = [np.array([h, w])]
500
+
501
+ if crop_mode == "overlap-and-resize":
502
+ crop_arr = batch_pixels_to_patches(crop_arr, image_patch_size)
503
+ mask_arr = batch_pixels_to_patches(mask_arr, image_patch_size).astype(np.float32).mean(axis=-1)
504
+ return np.stack(image_grid, 0), crop_arr, mask_arr, pooling_idx
505
+
506
+ # Finally do the same for the global image
507
+ resized, resized_mask, resize_idx = build_resized_image(
508
+ image,
509
+ resize_mode,
510
+ normalize_mode,
511
+ base_image_input_size,
512
+ pad_value,
513
+ image_patch_size
514
+ )
515
+ crop_arr = np.concatenate([resized, crop_arr], 0)
516
+
517
+ mask_arr = np.concatenate([resized_mask, mask_arr], 0)
518
+
519
+ resize_idx = arange_for_pooling(resize_idx, pooling_h, pooling_w)
520
+ h, w = resize_idx.shape[:2]
521
+ resize_idx = resize_idx.reshape([-1, pooling_h*pooling_w])
522
+
523
+ # Global image goes first, so the patch indices of the crops are offset by the number of patches in the global image
524
+ pooling_idx = np.where(
525
+ pooling_idx >= 0,
526
+ pooling_idx + crop_patch_h*crop_patch_w,
527
+ -1
528
+ )
529
+ pooling_idx = np.concatenate([resize_idx, pooling_idx])
530
+ image_grid = [
531
+ np.array([h, w]),
532
+ ] + image_grid
533
+
534
+ mask_arr = batch_pixels_to_patches(mask_arr, image_patch_size).astype(np.float32).mean(axis=-1)
535
+ return (
536
+ np.stack(image_grid, 0),
537
+ batch_pixels_to_patches(crop_arr, image_patch_size),
538
+ mask_arr,
539
+ pooling_idx
540
+ )
541
+ else:
542
+ raise NotImplementedError(crop_mode)
543
+
544
+
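A usage sketch of `image_to_patches_and_grids` with the processor defaults; the input image and the shapes marked `n_crops`/`n_pooled_tokens` are illustrative:

import numpy as np

image = np.zeros([480, 640, 3], dtype=np.float32)
image_grids, crops, mask, pooled_idx = image_to_patches_and_grids(
    image,
    crop_mode="overlap-and-resize-c2",
    resize_mode="siglip",
    normalize_mode="siglip",
    max_crops=8,
    overlap_margins=[4, 4],
    base_image_input_size=[378, 378],
    pad_value=0.0,
    image_patch_size=14,
    image_pooling_w=2,
    image_pooling_h=2,
)
# image_grids: [2, 2]                  -- pooled (h, w) of the global image and of the crop grid
# crops:       [n_crops + 1, 729, 588] -- 27*27 patches of 14*14*3 pixel values per crop
# mask:        [n_crops + 1, 729]
# pooled_idx:  [n_pooled_tokens, 4]    -- 2x2 groups of patch indices, -1 for padding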
545
+ def image_to_patches_and_tokens(
546
+ image: ImageInput,
547
+ crop_mode: str,
548
+ use_col_tokens: bool,
549
+ resize_mode: str,
550
+ normalize_mode: str,
551
+ max_crops: int,
552
+ overlap_margins: List[int],
553
+ base_image_input_size: List[int],
554
+ pad_value: float,
555
+ image_patch_size: int,
556
+ image_pooling_w: int,
557
+ image_pooling_h: int,
558
+ image_patch_token_id: int,
559
+ image_col_token_id: int,
560
+ image_start_token_id: int,
561
+ image_end_token_id: int,
562
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
563
+ """
564
+ :return image_tokens, the token IDs for this image, including special tokens
565
+ :return crops, the image crops to process with the ViT
566
+ :return mask, the padding mask for each crop
567
+ :return pooled_patch_idx, for each patch_id token in `image_tokens`, the indices of the
568
+ patches in `crops` to pool for that token, masked with -1
569
+ """
570
+
571
+ if isinstance(base_image_input_size, int):
572
+ base_image_input_size = (base_image_input_size, base_image_input_size)
573
+
574
+ base_image_input_d = image_patch_size
575
+ pooling_w = image_pooling_w
576
+ pooling_h = image_pooling_h
577
+ patch_id = image_patch_token_id
578
+ col_id = image_col_token_id
579
+ start_id = image_start_token_id
580
+ end_id = image_end_token_id
581
+ crop_patch_w = base_image_input_size[1] // base_image_input_d
582
+ crop_patch_h = base_image_input_size[0] // base_image_input_d
583
+
584
+ if crop_mode == "resize":
585
+ resized, resized_mask, resize_idx = build_resized_image(
586
+ image,
587
+ resize_mode,
588
+ normalize_mode,
589
+ base_image_input_size,
590
+ pad_value,
591
+ image_patch_size
592
+ )
593
+ pooling_idx = arange_for_pooling(resize_idx, pooling_h, pooling_w)
594
+ h, w = pooling_idx.shape[:2]
595
+ pooling_idx = pooling_idx.reshape([-1, pooling_h*pooling_w])
596
+ per_row = np.full(
597
+ (w,),
598
+ patch_id,
599
+ dtype=np.int32
600
+ )
601
+ if use_col_tokens:
602
+ per_row = np.concatenate([per_row, [col_id]], 0)
603
+ extra_tokens = np.tile(per_row, [h])
604
+ joint = [
605
+ [start_id],
606
+ extra_tokens,
607
+ [end_id],
608
+ ]
609
+ return (
610
+ np.concatenate(joint, 0),
611
+ batch_pixels_to_patches(resized, image_patch_size),
612
+ batch_pixels_to_patches(resized_mask, image_patch_size).mean(-1),
613
+ pooling_idx,
614
+ )
615
+
616
+ if crop_mode in ["overlap-and-resize-c2", "overlap-and-resize"]:
617
+ crop_arr, mask_arr, patch_idx_arr = build_overlapping_crops(
618
+ image,
619
+ resize_mode,
620
+ normalize_mode,
621
+ max_crops,
622
+ overlap_margins,
623
+ base_image_input_size,
624
+ pad_value,
625
+ image_patch_size,
626
+ )
627
+ pooling_idx = arange_for_pooling(patch_idx_arr, pooling_h, pooling_w)
628
+ h, w = pooling_idx.shape[:2]
629
+ pooling_idx = pooling_idx.reshape([-1, pooling_h*pooling_w])
630
+
631
+ # Now build the output tokens
632
+ per_row = np.full(w, patch_id, dtype=np.int32)
633
+ if use_col_tokens:
634
+ per_row = np.concatenate([per_row, [col_id]], 0)
635
+ joint = np.tile(per_row, [h])
636
+ joint = [
637
+ [start_id],
638
+ joint,
639
+ [end_id]
640
+ ]
641
+
642
+ if crop_mode == "overlap-and-resize":
643
+ crop_arr = batch_pixels_to_patches(crop_arr, image_patch_size)
644
+ mask_arr = batch_pixels_to_patches(mask_arr, image_patch_size).astype(np.float32).mean(axis=-1)
645
+ return np.concatenate(joint, 0), crop_arr, mask_arr, pooling_idx
646
+
647
+ # Finally do the same for the global image
648
+ resized, resized_mask, resize_idx = build_resized_image(
649
+ image,
650
+ resize_mode,
651
+ normalize_mode,
652
+ base_image_input_size,
653
+ pad_value,
654
+ image_patch_size
655
+ )
656
+ crop_arr = np.concatenate([resized, crop_arr], 0)
657
+
658
+ mask_arr = np.concatenate([resized_mask, mask_arr], 0)
659
+
660
+ resize_idx = arange_for_pooling(resize_idx, pooling_h, pooling_w)
661
+ h, w = resize_idx.shape[:2]
662
+ resize_idx = resize_idx.reshape([-1, pooling_h*pooling_w])
663
+
664
+ # Global image goes first, so the patch indices of the crops are offset by the number of patches in the global image
665
+ pooling_idx = np.where(
666
+ pooling_idx >= 0,
667
+ pooling_idx + crop_patch_h*crop_patch_w,
668
+ -1
669
+ )
670
+ pooling_idx = np.concatenate([resize_idx, pooling_idx])
671
+
672
+ per_row = np.full(
673
+ (w,),
674
+ patch_id,
675
+ dtype=np.int32
676
+ )
677
+ if use_col_tokens:
678
+ per_row = np.concatenate([per_row, [col_id]], 0)
679
+ extra_tokens = np.tile(per_row, [h])
680
+ joint = [
681
+ [start_id],
682
+ extra_tokens,
683
+ [end_id],
684
+ ] + joint
685
+ mask_arr = batch_pixels_to_patches(mask_arr, image_patch_size).astype(np.float32).mean(axis=-1)
686
+ return (
687
+ np.concatenate(joint, 0),
688
+ batch_pixels_to_patches(crop_arr, image_patch_size),
689
+ mask_arr,
690
+ pooling_idx
691
+ )
692
+ else:
693
+ raise NotImplementedError(crop_mode)
694
+
695
+
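For the "overlap-and-resize-c2" mode above, the returned token sequence is one [start, patch/col tokens, end] block for the global image followed by one block for the joined crop grid. A small sketch of the per-block token count under that layout:

def block_length(h, w, use_col_tokens=True):
    # Mirrors the construction above: h rows of w patch tokens, an optional
    # column token per row, plus the start and end tokens.
    per_row = w + (1 if use_col_tokens else 0)
    return 1 + h * per_row + 1

# With the defaults the pooled global image is 14x14, so its block
# contributes 1 + 14*(14+1) + 1 = 212 tokens.
assert block_length(14, 14) == 212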
696
+ class MolmoActImagesKwargs(ImagesKwargs, total=False):
697
+ crop_mode: Optional[str]
698
+ resize_mode: Optional[str]
699
+ normalize_mode: Optional[str]
700
+ max_crops: Optional[int]
701
+ max_multi_image_crops: Optional[int]
702
+ overlap_margins: Optional[List[int]]
703
+ base_image_input_size: Optional[List[int]]
704
+ pad_value: Optional[float]
705
+ image_patch_size: Optional[int]
706
+ image_pooling_w: Optional[int]
707
+ image_pooling_h: Optional[int]
708
+
709
+
710
+ class MolmoActImageProcessor(BaseImageProcessor):
711
+
712
+ model_input_names = ["images", "pooled_patches_idx", "image_masks"]
713
+
714
+ def __init__(
715
+ self,
716
+ crop_mode: str = "overlap-and-resize-c2",
717
+ resize_mode: str = "siglip",
718
+ normalize_mode: str = "siglip",
719
+ max_crops: int = 8,
720
+ max_multi_image_crops: int = 4,
721
+ overlap_margins: List[int] = [4, 4],
722
+ base_image_input_size: List[int] = (378, 378),
723
+ pad_value: float = 0.0,
724
+ image_patch_size: int = 14,
725
+ image_pooling_w: int = 2,
726
+ image_pooling_h: int = 2,
727
+ do_convert_rgb: bool = True,
728
+ do_pad: Optional[bool] = True,
729
+ **kwargs,
730
+ ) -> None:
731
+ super().__init__(**kwargs)
732
+ self.crop_mode = crop_mode
733
+ self.resize_mode = resize_mode
734
+ self.normalize_mode = normalize_mode
735
+ self.overlap_margins = overlap_margins
736
+ self.max_crops = max_crops
737
+ self.max_multi_image_crops = max_multi_image_crops
738
+ self.overlap_margins = overlap_margins
739
+ self.base_image_input_size = base_image_input_size
740
+ self.pad_value = pad_value
741
+ self.image_patch_size = image_patch_size
742
+ self.image_pooling_w = image_pooling_w
743
+ self.image_pooling_h = image_pooling_h
744
+ self.do_convert_rgb = do_convert_rgb
745
+ self.do_pad = do_pad
746
+
747
+ def to_channel_dimension_last(
748
+ self,
749
+ images: List[ImageInput],
750
+ ) -> List[ImageInput]:
751
+ """
752
+ Convert images to channel dimension last.
753
+ """
754
+ new_images = []
755
+ for image in images:
756
+ if is_multi_image(image):
757
+ new_images.append([to_channel_dimension_format(img, ChannelDimension.LAST) for img in image])
758
+ else:
759
+ new_images.append(to_channel_dimension_format(image, ChannelDimension.LAST))
760
+ return new_images
761
+
762
+ def to_numpy_array(
763
+ self,
764
+ images: List[ImageInput],
765
+ ) -> List[np.ndarray]:
766
+ """
767
+ Convert images to numpy array.
768
+ """
769
+ new_images = []
770
+ for image in images:
771
+ if is_multi_image(image):
772
+ new_images.append([to_numpy_array(img) for img in image])
773
+ else:
774
+ new_images.append(to_numpy_array(image))
775
+ return new_images
776
+
777
+ def to_rgb(
778
+ self,
779
+ images: List[ImageInput],
780
+ ) -> List[ImageInput]:
781
+ """
782
+ Convert images to RGB.
783
+ """
784
+ new_images = []
785
+ for image in images:
786
+ if is_multi_image(image):
787
+ new_images.append([convert_to_rgb(img) for img in image])
788
+ else:
789
+ new_images.append(convert_to_rgb(image))
790
+ return new_images
791
+
792
+ def pad_arrays(self, arrays: List[np.ndarray], pad_value: float = -1) -> np.ndarray:
793
+ max_len = max(arr.shape[0] for arr in arrays)
794
+ padded_arr = np.full(
795
+ [len(arrays), max_len] + list(arrays[0].shape[1:]), pad_value, dtype=arrays[0].dtype
796
+ )
797
+ for ix, arr in enumerate(arrays):
798
+ padded_arr[ix, :len(arr)] = arr[:max_len]
799
+ return padded_arr
800
+
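A small sketch of `pad_arrays`: arrays with different leading lengths are stacked into one batch and padded with -1 (the default) along the first axis:

import numpy as np

a = np.zeros([3, 4], dtype=np.int64)
b = np.zeros([5, 4], dtype=np.int64)
padded = MolmoActImageProcessor().pad_arrays([a, b])
assert padded.shape == (2, 5, 4)
assert (padded[0, 3:] == -1).all()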
801
+ def pad_for_batching(self, data: Dict[str, Any]) -> Dict[str, Any]:
802
+ """
803
+ Pad the data for batching.
804
+ """
805
+ images = self.pad_arrays(data["images"])
806
+ pooled_patches_idx = self.pad_arrays(data["pooled_patches_idx"])
807
+ image_masks = self.pad_arrays(data["image_masks"])
808
+ image_grids = self.pad_arrays(data["image_grids"])
809
+ new_data = dict(
810
+ images=images,
811
+ pooled_patches_idx=pooled_patches_idx,
812
+ image_masks=image_masks,
813
+ image_grids=image_grids,
814
+ )
815
+ return new_data
816
+
817
+ def preprocess(
818
+ self,
819
+ images: Union[ImageInput, List[ImageInput]],
820
+ crop_mode: Optional[str] = None,
821
+ resize_mode: Optional[str] = None,
822
+ normalize_mode: Optional[str] = None,
823
+ max_crops: Optional[int] = None,
824
+ max_multi_image_crops: Optional[int] = None,
825
+ overlap_margins: Optional[List[int]] = None,
826
+ base_image_input_size: Optional[List[int]] = None,
827
+ pad_value: Optional[float] = None,
828
+ image_patch_size: Optional[int] = None,
829
+ image_pooling_w: Optional[int] = None,
830
+ image_pooling_h: Optional[int] = None,
831
+ do_convert_rgb: Optional[bool] = None,
832
+ do_pad: Optional[bool] = None,
833
+ return_tensors: Optional[Union[str, TensorType]] = None,
834
+ **kwargs,
835
+ ) -> BatchFeature:
836
+ """
837
+ Preprocess an image for the model.
838
+ Args:
839
+ image: The image to preprocess.
840
+ crop_mode: The crop mode to use. If None, use the default crop mode.
841
+ resize_mode: The resize mode to use. If None, use the default resize mode.
842
+ normalize_mode: The normalization mode to use. If None, use the default normalization mode.
843
+ max_crops: The maximum number of crops to use. If None, use the default value.
844
+ max_multi_image_crops: The maximum number of crops to use for multi-image inputs.
845
+ overlap_margins: The overlap margins to use. If None, use the default values.
846
+ base_image_input_size: The base image input size to use. If None, use the default size.
847
+ pad_value: The padding value to use. If None, use the default value.
848
+ image_patch_size: The size of the image patches. If None, use the default size.
849
+ image_pooling_h: The height of the image pooling. If None, use the default height.
850
+ image_pooling_w: The width of the image pooling. If None, use the default width.
851
+ do_convert_rgb: Whether to convert the image to RGB. If None, use the default value.
852
+ do_pad: Whether to pad image features. If None, use the default value.
853
+
854
+ Returns:
855
+ A `BatchFeature` containing:
856
+ - The preprocessed image crops (`images`)
857
+ - The pooling indices (`pooled_patches_idx`)
858
+ - The padding masks (`image_masks`)
859
+ - The image grids (`image_grids`)
860
+ """
861
+ images = make_batched_images(images)
862
+
863
+ if not valid_images(images):
864
+ raise ValueError("Invalid image input")
865
+
866
+ crop_mode = crop_mode or self.crop_mode
867
+ normalize_mode = normalize_mode or self.normalize_mode
868
+ resize_mode = resize_mode or self.resize_mode
869
+ max_crops = max_crops or self.max_crops
870
+ max_multi_image_crops = max_multi_image_crops or self.max_multi_image_crops
871
+ overlap_margins = overlap_margins or self.overlap_margins
872
+ base_image_input_size = base_image_input_size or self.base_image_input_size
873
+ pad_value = pad_value or self.pad_value
874
+ image_patch_size = image_patch_size or self.image_patch_size
875
+ image_pooling_w = image_pooling_w or self.image_pooling_w
876
+ image_pooling_h = image_pooling_h or self.image_pooling_h
877
+ do_convert_rgb = self.do_convert_rgb if do_convert_rgb is None else do_convert_rgb
878
+ do_pad = self.do_pad if do_pad is None else do_pad
879
+
880
+ if do_convert_rgb:
881
+ images = self.to_rgb(images)
882
+
883
+ # All transformations expect numpy arrays.
884
+ images = self.to_numpy_array(images)
885
+
886
+ # All transformations expect channel dimension last.
887
+ images = self.to_channel_dimension_last(images)
888
+
889
+ batch_image_grids = []
890
+ batch_crops = []
891
+ batch_crop_masks = []
892
+ batch_pooled_patches_idx = []
893
+
894
+ for image in images:
895
+ if is_multi_image(image):
896
+ all_image_grids = []
897
+ all_crops = []
898
+ all_crop_masks = []
899
+ pooled_patches_idx = []
900
+ for img in image:
901
+ image_grid, crops, img_mask, pooled_idx = image_to_patches_and_grids(
902
+ img,
903
+ crop_mode,
904
+ resize_mode,
905
+ normalize_mode,
906
+ max_multi_image_crops,
907
+ overlap_margins,
908
+ base_image_input_size,
909
+ pad_value,
910
+ image_patch_size,
911
+ image_pooling_w,
912
+ image_pooling_h,
913
+ )
914
+ pooled_patches_idx.append(pooled_idx + sum(np.prod(x.shape[:2]) for x in all_crops))
915
+ all_crops.append(crops)
916
+ all_crop_masks.append(img_mask)
917
+ all_image_grids.append(image_grid)
918
+ all_image_grids = np.concatenate(all_image_grids, 0)
919
+ all_crops = np.concatenate(all_crops, 0)
920
+ all_crop_masks = np.concatenate(all_crop_masks, 0)
921
+ pooled_patches_idx = np.concatenate(pooled_patches_idx, 0)
922
+
923
+ batch_image_grids.append(all_image_grids)
924
+ batch_crops.append(all_crops)
925
+ batch_crop_masks.append(all_crop_masks)
926
+ batch_pooled_patches_idx.append(pooled_patches_idx)
927
+ else:
928
+ image_grid, crops, img_mask, pooled_idx = image_to_patches_and_grids(
929
+ image,
930
+ crop_mode,
931
+ resize_mode,
932
+ normalize_mode,
933
+ max_crops,
934
+ overlap_margins,
935
+ base_image_input_size,
936
+ pad_value,
937
+ image_patch_size,
938
+ image_pooling_w,
939
+ image_pooling_h,
940
+ )
941
+ batch_image_grids.append(image_grid)
942
+ batch_crops.append(crops)
943
+ batch_crop_masks.append(img_mask)
944
+ batch_pooled_patches_idx.append(pooled_idx)
945
+
946
+ data = dict(
947
+ images=batch_crops,
948
+ pooled_patches_idx=batch_pooled_patches_idx,
949
+ image_masks=batch_crop_masks,
950
+ image_grids=batch_image_grids,
951
+ )
952
+
953
+ if do_pad:
954
+ data = self.pad_for_batching(data)
955
+
956
+ return BatchFeature(data, tensor_type=return_tensors)
957
+
958
+
959
+ MolmoActImageProcessor.register_for_auto_class()
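A minimal end-to-end sketch of calling the processor directly; the all-zeros input image is a hypothetical placeholder and the `n_crops`/`n_pooled_tokens` shapes are illustrative:

import numpy as np

processor = MolmoActImageProcessor()
image = np.zeros([480, 640, 3], dtype=np.uint8)
features = processor.preprocess([image])
# With do_pad=True (the default) every field is padded and stacked per batch item:
# features["images"]:             [1, n_crops, 729, 588]
# features["pooled_patches_idx"]: [1, n_pooled_tokens, 4]
# features["image_masks"]:        [1, n_crops, 729]
# features["image_grids"]:        [1, 2, 2]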
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc6cee860646e6245edd505fef7896a8ebc5fd47d6e260852699ffcb351d4119
3
+ size 4975847688
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3b707bb662766ec4202e82fa9245ffefc1f9ec40e4aebe011426ea2459e463b
3
+ size 4890972104
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a772516b69576965758f7683d47a10ef81916bcf7c52e10836329e14524a6ff3
3
+ size 4620250280
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65446424dc3ae3bfe8675c254936da139406793f5329b1db40466a6f9ba02b17
3
+ size 826278016
model.safetensors.index.json ADDED
@@ -0,0 +1,636 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 15313260544
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00004-of-00004.safetensors",
7
+ "model.transformer.blocks.0.attn_norm.weight": "model-00001-of-00004.safetensors",
8
+ "model.transformer.blocks.0.ff_norm.weight": "model-00001-of-00004.safetensors",
9
+ "model.transformer.blocks.0.mlp.ff_out.weight": "model-00001-of-00004.safetensors",
10
+ "model.transformer.blocks.0.mlp.ff_proj.weight": "model-00001-of-00004.safetensors",
11
+ "model.transformer.blocks.0.self_attn.att_proj.weight": "model-00001-of-00004.safetensors",
12
+ "model.transformer.blocks.0.self_attn.attn_out.weight": "model-00001-of-00004.safetensors",
13
+ "model.transformer.blocks.0.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
14
+ "model.transformer.blocks.0.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
15
+ "model.transformer.blocks.1.attn_norm.weight": "model-00001-of-00004.safetensors",
16
+ "model.transformer.blocks.1.ff_norm.weight": "model-00001-of-00004.safetensors",
17
+ "model.transformer.blocks.1.mlp.ff_out.weight": "model-00001-of-00004.safetensors",
18
+ "model.transformer.blocks.1.mlp.ff_proj.weight": "model-00001-of-00004.safetensors",
19
+ "model.transformer.blocks.1.self_attn.att_proj.weight": "model-00001-of-00004.safetensors",
20
+ "model.transformer.blocks.1.self_attn.attn_out.weight": "model-00001-of-00004.safetensors",
21
+ "model.transformer.blocks.1.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
22
+ "model.transformer.blocks.1.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
23
+ "model.transformer.blocks.10.attn_norm.weight": "model-00002-of-00004.safetensors",
24
+ "model.transformer.blocks.10.ff_norm.weight": "model-00002-of-00004.safetensors",
25
+ "model.transformer.blocks.10.mlp.ff_out.weight": "model-00002-of-00004.safetensors",
26
+ "model.transformer.blocks.10.mlp.ff_proj.weight": "model-00002-of-00004.safetensors",
27
+ "model.transformer.blocks.10.self_attn.att_proj.weight": "model-00001-of-00004.safetensors",
28
+ "model.transformer.blocks.10.self_attn.attn_out.weight": "model-00002-of-00004.safetensors",
29
+ "model.transformer.blocks.10.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
30
+ "model.transformer.blocks.10.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
31
+ "model.transformer.blocks.11.attn_norm.weight": "model-00002-of-00004.safetensors",
32
+ "model.transformer.blocks.11.ff_norm.weight": "model-00002-of-00004.safetensors",
33
+ "model.transformer.blocks.11.mlp.ff_out.weight": "model-00002-of-00004.safetensors",
34
+ "model.transformer.blocks.11.mlp.ff_proj.weight": "model-00002-of-00004.safetensors",
35
+ "model.transformer.blocks.11.self_attn.att_proj.weight": "model-00002-of-00004.safetensors",
36
+ "model.transformer.blocks.11.self_attn.attn_out.weight": "model-00002-of-00004.safetensors",
37
+ "model.transformer.blocks.11.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
38
+ "model.transformer.blocks.11.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
39
+ "model.transformer.blocks.12.attn_norm.weight": "model-00002-of-00004.safetensors",
40
+ "model.transformer.blocks.12.ff_norm.weight": "model-00002-of-00004.safetensors",
41
+ "model.transformer.blocks.12.mlp.ff_out.weight": "model-00002-of-00004.safetensors",
42
+ "model.transformer.blocks.12.mlp.ff_proj.weight": "model-00002-of-00004.safetensors",
43
+ "model.transformer.blocks.12.self_attn.att_proj.weight": "model-00002-of-00004.safetensors",
44
+ "model.transformer.blocks.12.self_attn.attn_out.weight": "model-00002-of-00004.safetensors",
45
+ "model.transformer.blocks.12.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
46
+ "model.transformer.blocks.12.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
47
+ "model.transformer.blocks.13.attn_norm.weight": "model-00002-of-00004.safetensors",
48
+ "model.transformer.blocks.13.ff_norm.weight": "model-00002-of-00004.safetensors",
49
+ "model.transformer.blocks.13.mlp.ff_out.weight": "model-00002-of-00004.safetensors",
50
+ "model.transformer.blocks.13.mlp.ff_proj.weight": "model-00002-of-00004.safetensors",
51
+ "model.transformer.blocks.13.self_attn.att_proj.weight": "model-00002-of-00004.safetensors",
52
+ "model.transformer.blocks.13.self_attn.attn_out.weight": "model-00002-of-00004.safetensors",
53
+ "model.transformer.blocks.13.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
54
+ "model.transformer.blocks.13.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
55
+ "model.transformer.blocks.14.attn_norm.weight": "model-00002-of-00004.safetensors",
56
+ "model.transformer.blocks.14.ff_norm.weight": "model-00002-of-00004.safetensors",
57
+ "model.transformer.blocks.14.mlp.ff_out.weight": "model-00002-of-00004.safetensors",
58
+ "model.transformer.blocks.14.mlp.ff_proj.weight": "model-00002-of-00004.safetensors",
59
+ "model.transformer.blocks.14.self_attn.att_proj.weight": "model-00002-of-00004.safetensors",
60
+ "model.transformer.blocks.14.self_attn.attn_out.weight": "model-00002-of-00004.safetensors",
61
+ "model.transformer.blocks.14.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
62
+ "model.transformer.blocks.14.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
63
+ "model.transformer.blocks.15.attn_norm.weight": "model-00002-of-00004.safetensors",
64
+ "model.transformer.blocks.15.ff_norm.weight": "model-00002-of-00004.safetensors",
65
+ "model.transformer.blocks.15.mlp.ff_out.weight": "model-00002-of-00004.safetensors",
66
+ "model.transformer.blocks.15.mlp.ff_proj.weight": "model-00002-of-00004.safetensors",
67
+ "model.transformer.blocks.15.self_attn.att_proj.weight": "model-00002-of-00004.safetensors",
68
+ "model.transformer.blocks.15.self_attn.attn_out.weight": "model-00002-of-00004.safetensors",
69
+ "model.transformer.blocks.15.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
70
+ "model.transformer.blocks.15.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
71
+ "model.transformer.blocks.16.attn_norm.weight": "model-00002-of-00004.safetensors",
72
+ "model.transformer.blocks.16.ff_norm.weight": "model-00002-of-00004.safetensors",
73
+ "model.transformer.blocks.16.mlp.ff_out.weight": "model-00002-of-00004.safetensors",
74
+ "model.transformer.blocks.16.mlp.ff_proj.weight": "model-00002-of-00004.safetensors",
75
+ "model.transformer.blocks.16.self_attn.att_proj.weight": "model-00002-of-00004.safetensors",
76
+ "model.transformer.blocks.16.self_attn.attn_out.weight": "model-00002-of-00004.safetensors",
77
+ "model.transformer.blocks.16.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
78
+ "model.transformer.blocks.16.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
79
+ "model.transformer.blocks.17.attn_norm.weight": "model-00002-of-00004.safetensors",
80
+ "model.transformer.blocks.17.ff_norm.weight": "model-00002-of-00004.safetensors",
81
+ "model.transformer.blocks.17.mlp.ff_out.weight": "model-00002-of-00004.safetensors",
82
+ "model.transformer.blocks.17.mlp.ff_proj.weight": "model-00002-of-00004.safetensors",
83
+ "model.transformer.blocks.17.self_attn.att_proj.weight": "model-00002-of-00004.safetensors",
84
+ "model.transformer.blocks.17.self_attn.attn_out.weight": "model-00002-of-00004.safetensors",
85
+ "model.transformer.blocks.17.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
86
+ "model.transformer.blocks.17.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
87
+ "model.transformer.blocks.18.attn_norm.weight": "model-00002-of-00004.safetensors",
88
+ "model.transformer.blocks.18.ff_norm.weight": "model-00002-of-00004.safetensors",
89
+ "model.transformer.blocks.18.mlp.ff_out.weight": "model-00002-of-00004.safetensors",
90
+ "model.transformer.blocks.18.mlp.ff_proj.weight": "model-00002-of-00004.safetensors",
91
+ "model.transformer.blocks.18.self_attn.att_proj.weight": "model-00002-of-00004.safetensors",
92
+ "model.transformer.blocks.18.self_attn.attn_out.weight": "model-00002-of-00004.safetensors",
93
+ "model.transformer.blocks.18.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
94
+ "model.transformer.blocks.18.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
95
+ "model.transformer.blocks.19.attn_norm.weight": "model-00002-of-00004.safetensors",
96
+ "model.transformer.blocks.19.ff_norm.weight": "model-00002-of-00004.safetensors",
97
+ "model.transformer.blocks.19.mlp.ff_out.weight": "model-00002-of-00004.safetensors",
98
+ "model.transformer.blocks.19.mlp.ff_proj.weight": "model-00002-of-00004.safetensors",
99
+ "model.transformer.blocks.19.self_attn.att_proj.weight": "model-00002-of-00004.safetensors",
100
+ "model.transformer.blocks.19.self_attn.attn_out.weight": "model-00002-of-00004.safetensors",
101
+ "model.transformer.blocks.19.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
102
+ "model.transformer.blocks.19.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
103
+ "model.transformer.blocks.2.attn_norm.weight": "model-00001-of-00004.safetensors",
104
+ "model.transformer.blocks.2.ff_norm.weight": "model-00001-of-00004.safetensors",
105
+ "model.transformer.blocks.2.mlp.ff_out.weight": "model-00001-of-00004.safetensors",
106
+ "model.transformer.blocks.2.mlp.ff_proj.weight": "model-00001-of-00004.safetensors",
107
+ "model.transformer.blocks.2.self_attn.att_proj.weight": "model-00001-of-00004.safetensors",
108
+ "model.transformer.blocks.2.self_attn.attn_out.weight": "model-00001-of-00004.safetensors",
109
+ "model.transformer.blocks.2.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
110
+ "model.transformer.blocks.2.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
111
+ "model.transformer.blocks.20.attn_norm.weight": "model-00002-of-00004.safetensors",
112
+ "model.transformer.blocks.20.ff_norm.weight": "model-00002-of-00004.safetensors",
113
+ "model.transformer.blocks.20.mlp.ff_out.weight": "model-00002-of-00004.safetensors",
114
+ "model.transformer.blocks.20.mlp.ff_proj.weight": "model-00002-of-00004.safetensors",
115
+ "model.transformer.blocks.20.self_attn.att_proj.weight": "model-00002-of-00004.safetensors",
116
+ "model.transformer.blocks.20.self_attn.attn_out.weight": "model-00002-of-00004.safetensors",
117
+ "model.transformer.blocks.20.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
118
+ "model.transformer.blocks.20.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
119
+ "model.transformer.blocks.21.attn_norm.weight": "model-00002-of-00004.safetensors",
120
+ "model.transformer.blocks.21.ff_norm.weight": "model-00002-of-00004.safetensors",
121
+ "model.transformer.blocks.21.mlp.ff_out.weight": "model-00002-of-00004.safetensors",
122
+ "model.transformer.blocks.21.mlp.ff_proj.weight": "model-00002-of-00004.safetensors",
123
+ "model.transformer.blocks.21.self_attn.att_proj.weight": "model-00002-of-00004.safetensors",
124
+ "model.transformer.blocks.21.self_attn.attn_out.weight": "model-00002-of-00004.safetensors",
125
+ "model.transformer.blocks.21.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
126
+ "model.transformer.blocks.21.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
127
+ "model.transformer.blocks.22.attn_norm.weight": "model-00002-of-00004.safetensors",
128
+ "model.transformer.blocks.22.ff_norm.weight": "model-00003-of-00004.safetensors",
129
+ "model.transformer.blocks.22.mlp.ff_out.weight": "model-00003-of-00004.safetensors",
130
+ "model.transformer.blocks.22.mlp.ff_proj.weight": "model-00003-of-00004.safetensors",
131
+ "model.transformer.blocks.22.self_attn.att_proj.weight": "model-00002-of-00004.safetensors",
132
+ "model.transformer.blocks.22.self_attn.attn_out.weight": "model-00002-of-00004.safetensors",
133
+ "model.transformer.blocks.22.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
134
+ "model.transformer.blocks.22.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
135
+ "model.transformer.blocks.23.attn_norm.weight": "model-00003-of-00004.safetensors",
136
+ "model.transformer.blocks.23.ff_norm.weight": "model-00003-of-00004.safetensors",
137
+ "model.transformer.blocks.23.mlp.ff_out.weight": "model-00003-of-00004.safetensors",
138
+ "model.transformer.blocks.23.mlp.ff_proj.weight": "model-00003-of-00004.safetensors",
139
+ "model.transformer.blocks.23.self_attn.att_proj.weight": "model-00003-of-00004.safetensors",
140
+ "model.transformer.blocks.23.self_attn.attn_out.weight": "model-00003-of-00004.safetensors",
141
+ "model.transformer.blocks.23.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
142
+ "model.transformer.blocks.23.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
143
+ "model.transformer.blocks.24.attn_norm.weight": "model-00003-of-00004.safetensors",
144
+ "model.transformer.blocks.24.ff_norm.weight": "model-00003-of-00004.safetensors",
145
+ "model.transformer.blocks.24.mlp.ff_out.weight": "model-00003-of-00004.safetensors",
146
+ "model.transformer.blocks.24.mlp.ff_proj.weight": "model-00003-of-00004.safetensors",
147
+ "model.transformer.blocks.24.self_attn.att_proj.weight": "model-00003-of-00004.safetensors",
148
+ "model.transformer.blocks.24.self_attn.attn_out.weight": "model-00003-of-00004.safetensors",
149
+ "model.transformer.blocks.24.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
150
+ "model.transformer.blocks.24.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
151
+ "model.transformer.blocks.25.attn_norm.weight": "model-00003-of-00004.safetensors",
152
+ "model.transformer.blocks.25.ff_norm.weight": "model-00003-of-00004.safetensors",
153
+ "model.transformer.blocks.25.mlp.ff_out.weight": "model-00003-of-00004.safetensors",
154
+ "model.transformer.blocks.25.mlp.ff_proj.weight": "model-00003-of-00004.safetensors",
155
+ "model.transformer.blocks.25.self_attn.att_proj.weight": "model-00003-of-00004.safetensors",
156
+ "model.transformer.blocks.25.self_attn.attn_out.weight": "model-00003-of-00004.safetensors",
157
+ "model.transformer.blocks.25.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
158
+ "model.transformer.blocks.25.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
159
+ "model.transformer.blocks.26.attn_norm.weight": "model-00003-of-00004.safetensors",
160
+ "model.transformer.blocks.26.ff_norm.weight": "model-00003-of-00004.safetensors",
161
+ "model.transformer.blocks.26.mlp.ff_out.weight": "model-00003-of-00004.safetensors",
162
+ "model.transformer.blocks.26.mlp.ff_proj.weight": "model-00003-of-00004.safetensors",
163
+ "model.transformer.blocks.26.self_attn.att_proj.weight": "model-00003-of-00004.safetensors",
164
+ "model.transformer.blocks.26.self_attn.attn_out.weight": "model-00003-of-00004.safetensors",
165
+ "model.transformer.blocks.26.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
166
+ "model.transformer.blocks.26.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
167
+ "model.transformer.blocks.27.attn_norm.weight": "model-00003-of-00004.safetensors",
168
+ "model.transformer.blocks.27.ff_norm.weight": "model-00003-of-00004.safetensors",
169
+ "model.transformer.blocks.27.mlp.ff_out.weight": "model-00003-of-00004.safetensors",
170
+ "model.transformer.blocks.27.mlp.ff_proj.weight": "model-00003-of-00004.safetensors",
171
+ "model.transformer.blocks.27.self_attn.att_proj.weight": "model-00003-of-00004.safetensors",
172
+ "model.transformer.blocks.27.self_attn.attn_out.weight": "model-00003-of-00004.safetensors",
173
+ "model.transformer.blocks.27.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
174
+ "model.transformer.blocks.27.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
175
+ "model.transformer.blocks.28.attn_norm.weight": "model-00003-of-00004.safetensors",
176
+ "model.transformer.blocks.28.ff_norm.weight": "model-00003-of-00004.safetensors",
177
+ "model.transformer.blocks.28.mlp.ff_out.weight": "model-00003-of-00004.safetensors",
178
+ "model.transformer.blocks.28.mlp.ff_proj.weight": "model-00003-of-00004.safetensors",
179
+ "model.transformer.blocks.28.self_attn.att_proj.weight": "model-00003-of-00004.safetensors",
180
+ "model.transformer.blocks.28.self_attn.attn_out.weight": "model-00003-of-00004.safetensors",
181
+ "model.transformer.blocks.28.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
182
+ "model.transformer.blocks.28.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
183
+ "model.transformer.blocks.29.attn_norm.weight": "model-00003-of-00004.safetensors",
184
+ "model.transformer.blocks.29.ff_norm.weight": "model-00003-of-00004.safetensors",
185
+ "model.transformer.blocks.29.mlp.ff_out.weight": "model-00003-of-00004.safetensors",
186
+ "model.transformer.blocks.29.mlp.ff_proj.weight": "model-00003-of-00004.safetensors",
187
+ "model.transformer.blocks.29.self_attn.att_proj.weight": "model-00003-of-00004.safetensors",
188
+ "model.transformer.blocks.29.self_attn.attn_out.weight": "model-00003-of-00004.safetensors",
189
+ "model.transformer.blocks.29.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
190
+ "model.transformer.blocks.29.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
191
+ "model.transformer.blocks.3.attn_norm.weight": "model-00001-of-00004.safetensors",
192
+ "model.transformer.blocks.3.ff_norm.weight": "model-00001-of-00004.safetensors",
193
+ "model.transformer.blocks.3.mlp.ff_out.weight": "model-00001-of-00004.safetensors",
194
+ "model.transformer.blocks.3.mlp.ff_proj.weight": "model-00001-of-00004.safetensors",
195
+ "model.transformer.blocks.3.self_attn.att_proj.weight": "model-00001-of-00004.safetensors",
196
+ "model.transformer.blocks.3.self_attn.attn_out.weight": "model-00001-of-00004.safetensors",
197
+ "model.transformer.blocks.3.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
198
+ "model.transformer.blocks.3.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
199
+ "model.transformer.blocks.30.attn_norm.weight": "model-00003-of-00004.safetensors",
200
+ "model.transformer.blocks.30.ff_norm.weight": "model-00003-of-00004.safetensors",
201
+ "model.transformer.blocks.30.mlp.ff_out.weight": "model-00003-of-00004.safetensors",
202
+ "model.transformer.blocks.30.mlp.ff_proj.weight": "model-00003-of-00004.safetensors",
203
+ "model.transformer.blocks.30.self_attn.att_proj.weight": "model-00003-of-00004.safetensors",
204
+ "model.transformer.blocks.30.self_attn.attn_out.weight": "model-00003-of-00004.safetensors",
205
+ "model.transformer.blocks.30.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
206
+ "model.transformer.blocks.30.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
207
+ "model.transformer.blocks.31.attn_norm.weight": "model-00003-of-00004.safetensors",
208
+ "model.transformer.blocks.31.ff_norm.weight": "model-00003-of-00004.safetensors",
209
+ "model.transformer.blocks.31.mlp.ff_out.weight": "model-00003-of-00004.safetensors",
210
+ "model.transformer.blocks.31.mlp.ff_proj.weight": "model-00003-of-00004.safetensors",
211
+ "model.transformer.blocks.31.self_attn.att_proj.weight": "model-00003-of-00004.safetensors",
212
+ "model.transformer.blocks.31.self_attn.attn_out.weight": "model-00003-of-00004.safetensors",
213
+ "model.transformer.blocks.31.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
214
+ "model.transformer.blocks.31.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
215
+ "model.transformer.blocks.4.attn_norm.weight": "model-00001-of-00004.safetensors",
216
+ "model.transformer.blocks.4.ff_norm.weight": "model-00001-of-00004.safetensors",
217
+ "model.transformer.blocks.4.mlp.ff_out.weight": "model-00001-of-00004.safetensors",
218
+ "model.transformer.blocks.4.mlp.ff_proj.weight": "model-00001-of-00004.safetensors",
219
+ "model.transformer.blocks.4.self_attn.att_proj.weight": "model-00001-of-00004.safetensors",
220
+ "model.transformer.blocks.4.self_attn.attn_out.weight": "model-00001-of-00004.safetensors",
221
+ "model.transformer.blocks.4.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
222
+ "model.transformer.blocks.4.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
223
+ "model.transformer.blocks.5.attn_norm.weight": "model-00001-of-00004.safetensors",
224
+ "model.transformer.blocks.5.ff_norm.weight": "model-00001-of-00004.safetensors",
225
+ "model.transformer.blocks.5.mlp.ff_out.weight": "model-00001-of-00004.safetensors",
226
+ "model.transformer.blocks.5.mlp.ff_proj.weight": "model-00001-of-00004.safetensors",
227
+ "model.transformer.blocks.5.self_attn.att_proj.weight": "model-00001-of-00004.safetensors",
228
+ "model.transformer.blocks.5.self_attn.attn_out.weight": "model-00001-of-00004.safetensors",
229
+ "model.transformer.blocks.5.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
230
+ "model.transformer.blocks.5.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
231
+ "model.transformer.blocks.6.attn_norm.weight": "model-00001-of-00004.safetensors",
232
+ "model.transformer.blocks.6.ff_norm.weight": "model-00001-of-00004.safetensors",
233
+ "model.transformer.blocks.6.mlp.ff_out.weight": "model-00001-of-00004.safetensors",
234
+ "model.transformer.blocks.6.mlp.ff_proj.weight": "model-00001-of-00004.safetensors",
235
+ "model.transformer.blocks.6.self_attn.att_proj.weight": "model-00001-of-00004.safetensors",
236
+ "model.transformer.blocks.6.self_attn.attn_out.weight": "model-00001-of-00004.safetensors",
237
+ "model.transformer.blocks.6.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
238
+ "model.transformer.blocks.6.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
239
+ "model.transformer.blocks.7.attn_norm.weight": "model-00001-of-00004.safetensors",
240
+ "model.transformer.blocks.7.ff_norm.weight": "model-00001-of-00004.safetensors",
241
+ "model.transformer.blocks.7.mlp.ff_out.weight": "model-00001-of-00004.safetensors",
242
+ "model.transformer.blocks.7.mlp.ff_proj.weight": "model-00001-of-00004.safetensors",
243
+ "model.transformer.blocks.7.self_attn.att_proj.weight": "model-00001-of-00004.safetensors",
244
+ "model.transformer.blocks.7.self_attn.attn_out.weight": "model-00001-of-00004.safetensors",
245
+ "model.transformer.blocks.7.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
246
+ "model.transformer.blocks.7.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
247
+ "model.transformer.blocks.8.attn_norm.weight": "model-00001-of-00004.safetensors",
248
+ "model.transformer.blocks.8.ff_norm.weight": "model-00001-of-00004.safetensors",
249
+ "model.transformer.blocks.8.mlp.ff_out.weight": "model-00001-of-00004.safetensors",
250
+ "model.transformer.blocks.8.mlp.ff_proj.weight": "model-00001-of-00004.safetensors",
251
+ "model.transformer.blocks.8.self_attn.att_proj.weight": "model-00001-of-00004.safetensors",
252
+ "model.transformer.blocks.8.self_attn.attn_out.weight": "model-00001-of-00004.safetensors",
253
+ "model.transformer.blocks.8.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
254
+ "model.transformer.blocks.8.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
255
+ "model.transformer.blocks.9.attn_norm.weight": "model-00001-of-00004.safetensors",
256
+ "model.transformer.blocks.9.ff_norm.weight": "model-00001-of-00004.safetensors",
257
+ "model.transformer.blocks.9.mlp.ff_out.weight": "model-00001-of-00004.safetensors",
258
+ "model.transformer.blocks.9.mlp.ff_proj.weight": "model-00001-of-00004.safetensors",
259
+ "model.transformer.blocks.9.self_attn.att_proj.weight": "model-00001-of-00004.safetensors",
260
+ "model.transformer.blocks.9.self_attn.attn_out.weight": "model-00001-of-00004.safetensors",
261
+ "model.transformer.blocks.9.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
262
+ "model.transformer.blocks.9.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
263
+ "model.transformer.ln_f.weight": "model-00003-of-00004.safetensors",
264
+ "model.transformer.wte.embedding": "model-00001-of-00004.safetensors",
265
+ "model.transformer.wte.new_embedding": "model-00001-of-00004.safetensors",
266
+ "model.vision_backbone.image_pooling_2d.wk.bias": "model-00003-of-00004.safetensors",
267
+ "model.vision_backbone.image_pooling_2d.wk.weight": "model-00003-of-00004.safetensors",
268
+ "model.vision_backbone.image_pooling_2d.wo.bias": "model-00003-of-00004.safetensors",
269
+ "model.vision_backbone.image_pooling_2d.wo.weight": "model-00003-of-00004.safetensors",
270
+ "model.vision_backbone.image_pooling_2d.wq.bias": "model-00003-of-00004.safetensors",
271
+ "model.vision_backbone.image_pooling_2d.wq.weight": "model-00003-of-00004.safetensors",
272
+ "model.vision_backbone.image_pooling_2d.wv.bias": "model-00003-of-00004.safetensors",
273
+ "model.vision_backbone.image_pooling_2d.wv.weight": "model-00003-of-00004.safetensors",
274
+ "model.vision_backbone.image_projector.w1.weight": "model-00003-of-00004.safetensors",
275
+ "model.vision_backbone.image_projector.w2.weight": "model-00003-of-00004.safetensors",
276
+ "model.vision_backbone.image_projector.w3.weight": "model-00003-of-00004.safetensors",
277
+ "model.vision_backbone.image_vit.class_embedding": "model-00003-of-00004.safetensors",
278
+ "model.vision_backbone.image_vit.patch_embedding.weight": "model-00003-of-00004.safetensors",
279
+ "model.vision_backbone.image_vit.positional_embedding": "model-00003-of-00004.safetensors",
280
+ "model.vision_backbone.image_vit.pre_ln.bias": "model-00003-of-00004.safetensors",
281
+ "model.vision_backbone.image_vit.pre_ln.weight": "model-00003-of-00004.safetensors",
282
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wk.bias": "model-00003-of-00004.safetensors",
283
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wk.weight": "model-00003-of-00004.safetensors",
284
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wo.bias": "model-00003-of-00004.safetensors",
285
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wo.weight": "model-00003-of-00004.safetensors",
286
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wq.bias": "model-00003-of-00004.safetensors",
287
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wq.weight": "model-00003-of-00004.safetensors",
288
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wv.bias": "model-00003-of-00004.safetensors",
289
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wv.weight": "model-00003-of-00004.safetensors",
290
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention_norm.bias": "model-00003-of-00004.safetensors",
291
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention_norm.weight": "model-00003-of-00004.safetensors",
292
+ "model.vision_backbone.image_vit.transformer.resblocks.0.feed_forward.w1.bias": "model-00003-of-00004.safetensors",
293
+ "model.vision_backbone.image_vit.transformer.resblocks.0.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
294
+ "model.vision_backbone.image_vit.transformer.resblocks.0.feed_forward.w2.bias": "model-00003-of-00004.safetensors",
295
+ "model.vision_backbone.image_vit.transformer.resblocks.0.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
296
+ "model.vision_backbone.image_vit.transformer.resblocks.0.ffn_norm.bias": "model-00003-of-00004.safetensors",
297
+ "model.vision_backbone.image_vit.transformer.resblocks.0.ffn_norm.weight": "model-00003-of-00004.safetensors",
298
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wk.bias": "model-00003-of-00004.safetensors",
299
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wk.weight": "model-00003-of-00004.safetensors",
300
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wo.bias": "model-00003-of-00004.safetensors",
301
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wo.weight": "model-00003-of-00004.safetensors",
302
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wq.bias": "model-00003-of-00004.safetensors",
303
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wq.weight": "model-00003-of-00004.safetensors",
304
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wv.bias": "model-00003-of-00004.safetensors",
305
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wv.weight": "model-00003-of-00004.safetensors",
306
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention_norm.bias": "model-00003-of-00004.safetensors",
307
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention_norm.weight": "model-00003-of-00004.safetensors",
308
+ "model.vision_backbone.image_vit.transformer.resblocks.1.feed_forward.w1.bias": "model-00003-of-00004.safetensors",
309
+ "model.vision_backbone.image_vit.transformer.resblocks.1.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
310
+ "model.vision_backbone.image_vit.transformer.resblocks.1.feed_forward.w2.bias": "model-00003-of-00004.safetensors",
311
+ "model.vision_backbone.image_vit.transformer.resblocks.1.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
312
+ "model.vision_backbone.image_vit.transformer.resblocks.1.ffn_norm.bias": "model-00003-of-00004.safetensors",
313
+ "model.vision_backbone.image_vit.transformer.resblocks.1.ffn_norm.weight": "model-00003-of-00004.safetensors",
314
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wk.bias": "model-00003-of-00004.safetensors",
315
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wk.weight": "model-00003-of-00004.safetensors",
316
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wo.bias": "model-00003-of-00004.safetensors",
317
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wo.weight": "model-00003-of-00004.safetensors",
318
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wq.bias": "model-00003-of-00004.safetensors",
319
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wq.weight": "model-00003-of-00004.safetensors",
320
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wv.bias": "model-00003-of-00004.safetensors",
321
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wv.weight": "model-00003-of-00004.safetensors",
322
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention_norm.bias": "model-00003-of-00004.safetensors",
323
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention_norm.weight": "model-00003-of-00004.safetensors",
324
+ "model.vision_backbone.image_vit.transformer.resblocks.10.feed_forward.w1.bias": "model-00003-of-00004.safetensors",
325
+ "model.vision_backbone.image_vit.transformer.resblocks.10.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
326
+ "model.vision_backbone.image_vit.transformer.resblocks.10.feed_forward.w2.bias": "model-00003-of-00004.safetensors",
327
+ "model.vision_backbone.image_vit.transformer.resblocks.10.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
328
+ "model.vision_backbone.image_vit.transformer.resblocks.10.ffn_norm.bias": "model-00003-of-00004.safetensors",
329
+ "model.vision_backbone.image_vit.transformer.resblocks.10.ffn_norm.weight": "model-00003-of-00004.safetensors",
330
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wk.bias": "model-00003-of-00004.safetensors",
331
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wk.weight": "model-00003-of-00004.safetensors",
332
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wo.bias": "model-00003-of-00004.safetensors",
333
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wo.weight": "model-00003-of-00004.safetensors",
334
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wq.bias": "model-00003-of-00004.safetensors",
335
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wq.weight": "model-00003-of-00004.safetensors",
336
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wv.bias": "model-00003-of-00004.safetensors",
337
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wv.weight": "model-00003-of-00004.safetensors",
338
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention_norm.bias": "model-00003-of-00004.safetensors",
339
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention_norm.weight": "model-00003-of-00004.safetensors",
340
+ "model.vision_backbone.image_vit.transformer.resblocks.11.feed_forward.w1.bias": "model-00003-of-00004.safetensors",
341
+ "model.vision_backbone.image_vit.transformer.resblocks.11.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
342
+ "model.vision_backbone.image_vit.transformer.resblocks.11.feed_forward.w2.bias": "model-00003-of-00004.safetensors",
343
+ "model.vision_backbone.image_vit.transformer.resblocks.11.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
344
+ "model.vision_backbone.image_vit.transformer.resblocks.11.ffn_norm.bias": "model-00003-of-00004.safetensors",
345
+ "model.vision_backbone.image_vit.transformer.resblocks.11.ffn_norm.weight": "model-00003-of-00004.safetensors",
346
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wk.bias": "model-00003-of-00004.safetensors",
347
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wk.weight": "model-00003-of-00004.safetensors",
348
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wo.bias": "model-00003-of-00004.safetensors",
349
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wo.weight": "model-00003-of-00004.safetensors",
350
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wq.bias": "model-00003-of-00004.safetensors",
351
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wq.weight": "model-00003-of-00004.safetensors",
352
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wv.bias": "model-00003-of-00004.safetensors",
353
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wv.weight": "model-00003-of-00004.safetensors",
354
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention_norm.bias": "model-00003-of-00004.safetensors",
355
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention_norm.weight": "model-00003-of-00004.safetensors",
356
+ "model.vision_backbone.image_vit.transformer.resblocks.12.feed_forward.w1.bias": "model-00003-of-00004.safetensors",
357
+ "model.vision_backbone.image_vit.transformer.resblocks.12.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
358
+ "model.vision_backbone.image_vit.transformer.resblocks.12.feed_forward.w2.bias": "model-00003-of-00004.safetensors",
359
+ "model.vision_backbone.image_vit.transformer.resblocks.12.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
360
+ "model.vision_backbone.image_vit.transformer.resblocks.12.ffn_norm.bias": "model-00003-of-00004.safetensors",
361
+ "model.vision_backbone.image_vit.transformer.resblocks.12.ffn_norm.weight": "model-00003-of-00004.safetensors",
362
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wk.bias": "model-00003-of-00004.safetensors",
363
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wk.weight": "model-00003-of-00004.safetensors",
364
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wo.bias": "model-00003-of-00004.safetensors",
365
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wo.weight": "model-00003-of-00004.safetensors",
366
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wq.bias": "model-00003-of-00004.safetensors",
367
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wq.weight": "model-00003-of-00004.safetensors",
368
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wv.bias": "model-00003-of-00004.safetensors",
369
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wv.weight": "model-00003-of-00004.safetensors",
370
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention_norm.bias": "model-00003-of-00004.safetensors",
371
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention_norm.weight": "model-00003-of-00004.safetensors",
372
+ "model.vision_backbone.image_vit.transformer.resblocks.13.feed_forward.w1.bias": "model-00003-of-00004.safetensors",
373
+ "model.vision_backbone.image_vit.transformer.resblocks.13.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
374
+ "model.vision_backbone.image_vit.transformer.resblocks.13.feed_forward.w2.bias": "model-00003-of-00004.safetensors",
375
+ "model.vision_backbone.image_vit.transformer.resblocks.13.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
376
+ "model.vision_backbone.image_vit.transformer.resblocks.13.ffn_norm.bias": "model-00003-of-00004.safetensors",
377
+ "model.vision_backbone.image_vit.transformer.resblocks.13.ffn_norm.weight": "model-00003-of-00004.safetensors",
378
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wk.bias": "model-00003-of-00004.safetensors",
379
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wk.weight": "model-00003-of-00004.safetensors",
380
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wo.bias": "model-00003-of-00004.safetensors",
381
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wo.weight": "model-00003-of-00004.safetensors",
382
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wq.bias": "model-00003-of-00004.safetensors",
383
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wq.weight": "model-00003-of-00004.safetensors",
384
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wv.bias": "model-00003-of-00004.safetensors",
385
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wv.weight": "model-00003-of-00004.safetensors",
386
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention_norm.bias": "model-00003-of-00004.safetensors",
387
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention_norm.weight": "model-00003-of-00004.safetensors",
388
+ "model.vision_backbone.image_vit.transformer.resblocks.14.feed_forward.w1.bias": "model-00003-of-00004.safetensors",
389
+ "model.vision_backbone.image_vit.transformer.resblocks.14.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
390
+ "model.vision_backbone.image_vit.transformer.resblocks.14.feed_forward.w2.bias": "model-00003-of-00004.safetensors",
391
+ "model.vision_backbone.image_vit.transformer.resblocks.14.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
392
+ "model.vision_backbone.image_vit.transformer.resblocks.14.ffn_norm.bias": "model-00003-of-00004.safetensors",
393
+ "model.vision_backbone.image_vit.transformer.resblocks.14.ffn_norm.weight": "model-00003-of-00004.safetensors",
394
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wk.bias": "model-00003-of-00004.safetensors",
395
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wk.weight": "model-00003-of-00004.safetensors",
396
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wo.bias": "model-00003-of-00004.safetensors",
397
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wo.weight": "model-00003-of-00004.safetensors",
398
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wq.bias": "model-00003-of-00004.safetensors",
399
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wq.weight": "model-00003-of-00004.safetensors",
400
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wv.bias": "model-00003-of-00004.safetensors",
401
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wv.weight": "model-00003-of-00004.safetensors",
402
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention_norm.bias": "model-00003-of-00004.safetensors",
403
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention_norm.weight": "model-00003-of-00004.safetensors",
404
+ "model.vision_backbone.image_vit.transformer.resblocks.15.feed_forward.w1.bias": "model-00003-of-00004.safetensors",
405
+ "model.vision_backbone.image_vit.transformer.resblocks.15.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
406
+ "model.vision_backbone.image_vit.transformer.resblocks.15.feed_forward.w2.bias": "model-00003-of-00004.safetensors",
407
+ "model.vision_backbone.image_vit.transformer.resblocks.15.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
408
+ "model.vision_backbone.image_vit.transformer.resblocks.15.ffn_norm.bias": "model-00003-of-00004.safetensors",
409
+ "model.vision_backbone.image_vit.transformer.resblocks.15.ffn_norm.weight": "model-00003-of-00004.safetensors",
410
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wk.bias": "model-00003-of-00004.safetensors",
411
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wk.weight": "model-00003-of-00004.safetensors",
412
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wo.bias": "model-00003-of-00004.safetensors",
413
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wo.weight": "model-00003-of-00004.safetensors",
414
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wq.bias": "model-00003-of-00004.safetensors",
415
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wq.weight": "model-00003-of-00004.safetensors",
416
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wv.bias": "model-00003-of-00004.safetensors",
417
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wv.weight": "model-00003-of-00004.safetensors",
418
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention_norm.bias": "model-00003-of-00004.safetensors",
419
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention_norm.weight": "model-00003-of-00004.safetensors",
420
+ "model.vision_backbone.image_vit.transformer.resblocks.16.feed_forward.w1.bias": "model-00003-of-00004.safetensors",
421
+ "model.vision_backbone.image_vit.transformer.resblocks.16.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
422
+ "model.vision_backbone.image_vit.transformer.resblocks.16.feed_forward.w2.bias": "model-00003-of-00004.safetensors",
423
+ "model.vision_backbone.image_vit.transformer.resblocks.16.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
424
+ "model.vision_backbone.image_vit.transformer.resblocks.16.ffn_norm.bias": "model-00003-of-00004.safetensors",
425
+ "model.vision_backbone.image_vit.transformer.resblocks.16.ffn_norm.weight": "model-00003-of-00004.safetensors",
426
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wk.bias": "model-00003-of-00004.safetensors",
427
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wk.weight": "model-00003-of-00004.safetensors",
428
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wo.bias": "model-00003-of-00004.safetensors",
429
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wo.weight": "model-00003-of-00004.safetensors",
430
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wq.bias": "model-00003-of-00004.safetensors",
431
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wq.weight": "model-00003-of-00004.safetensors",
432
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wv.bias": "model-00003-of-00004.safetensors",
433
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wv.weight": "model-00003-of-00004.safetensors",
434
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention_norm.bias": "model-00003-of-00004.safetensors",
435
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention_norm.weight": "model-00003-of-00004.safetensors",
436
+ "model.vision_backbone.image_vit.transformer.resblocks.17.feed_forward.w1.bias": "model-00003-of-00004.safetensors",
437
+ "model.vision_backbone.image_vit.transformer.resblocks.17.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
438
+ "model.vision_backbone.image_vit.transformer.resblocks.17.feed_forward.w2.bias": "model-00003-of-00004.safetensors",
439
+ "model.vision_backbone.image_vit.transformer.resblocks.17.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
440
+ "model.vision_backbone.image_vit.transformer.resblocks.17.ffn_norm.bias": "model-00003-of-00004.safetensors",
441
+ "model.vision_backbone.image_vit.transformer.resblocks.17.ffn_norm.weight": "model-00003-of-00004.safetensors",
442
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wk.bias": "model-00003-of-00004.safetensors",
443
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wk.weight": "model-00003-of-00004.safetensors",
444
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wo.bias": "model-00003-of-00004.safetensors",
445
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wo.weight": "model-00003-of-00004.safetensors",
446
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wq.bias": "model-00003-of-00004.safetensors",
447
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wq.weight": "model-00003-of-00004.safetensors",
448
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wv.bias": "model-00003-of-00004.safetensors",
449
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wv.weight": "model-00003-of-00004.safetensors",
450
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention_norm.bias": "model-00003-of-00004.safetensors",
451
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention_norm.weight": "model-00003-of-00004.safetensors",
452
+ "model.vision_backbone.image_vit.transformer.resblocks.18.feed_forward.w1.bias": "model-00003-of-00004.safetensors",
453
+ "model.vision_backbone.image_vit.transformer.resblocks.18.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
454
+ "model.vision_backbone.image_vit.transformer.resblocks.18.feed_forward.w2.bias": "model-00003-of-00004.safetensors",
455
+ "model.vision_backbone.image_vit.transformer.resblocks.18.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
456
+ "model.vision_backbone.image_vit.transformer.resblocks.18.ffn_norm.bias": "model-00003-of-00004.safetensors",
457
+ "model.vision_backbone.image_vit.transformer.resblocks.18.ffn_norm.weight": "model-00003-of-00004.safetensors",
458
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wk.bias": "model-00003-of-00004.safetensors",
459
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wk.weight": "model-00003-of-00004.safetensors",
460
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wo.bias": "model-00003-of-00004.safetensors",
461
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wo.weight": "model-00003-of-00004.safetensors",
462
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wq.bias": "model-00003-of-00004.safetensors",
463
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wq.weight": "model-00003-of-00004.safetensors",
464
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wv.bias": "model-00003-of-00004.safetensors",
465
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wv.weight": "model-00003-of-00004.safetensors",
466
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention_norm.bias": "model-00003-of-00004.safetensors",
467
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention_norm.weight": "model-00003-of-00004.safetensors",
468
+ "model.vision_backbone.image_vit.transformer.resblocks.19.feed_forward.w1.bias": "model-00003-of-00004.safetensors",
469
+ "model.vision_backbone.image_vit.transformer.resblocks.19.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
470
+ "model.vision_backbone.image_vit.transformer.resblocks.19.feed_forward.w2.bias": "model-00003-of-00004.safetensors",
471
+ "model.vision_backbone.image_vit.transformer.resblocks.19.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
472
+ "model.vision_backbone.image_vit.transformer.resblocks.19.ffn_norm.bias": "model-00003-of-00004.safetensors",
473
+ "model.vision_backbone.image_vit.transformer.resblocks.19.ffn_norm.weight": "model-00003-of-00004.safetensors",
474
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wk.bias": "model-00003-of-00004.safetensors",
475
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wk.weight": "model-00003-of-00004.safetensors",
476
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wo.bias": "model-00003-of-00004.safetensors",
477
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wo.weight": "model-00003-of-00004.safetensors",
478
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wq.bias": "model-00003-of-00004.safetensors",
479
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wq.weight": "model-00003-of-00004.safetensors",
480
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wv.bias": "model-00003-of-00004.safetensors",
481
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wv.weight": "model-00003-of-00004.safetensors",
482
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention_norm.bias": "model-00003-of-00004.safetensors",
483
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention_norm.weight": "model-00003-of-00004.safetensors",
484
+ "model.vision_backbone.image_vit.transformer.resblocks.2.feed_forward.w1.bias": "model-00003-of-00004.safetensors",
485
+ "model.vision_backbone.image_vit.transformer.resblocks.2.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
486
+ "model.vision_backbone.image_vit.transformer.resblocks.2.feed_forward.w2.bias": "model-00003-of-00004.safetensors",
487
+ "model.vision_backbone.image_vit.transformer.resblocks.2.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
488
+ "model.vision_backbone.image_vit.transformer.resblocks.2.ffn_norm.bias": "model-00003-of-00004.safetensors",
489
+ "model.vision_backbone.image_vit.transformer.resblocks.2.ffn_norm.weight": "model-00003-of-00004.safetensors",
490
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wk.bias": "model-00003-of-00004.safetensors",
491
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wk.weight": "model-00003-of-00004.safetensors",
492
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wo.bias": "model-00003-of-00004.safetensors",
493
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wo.weight": "model-00003-of-00004.safetensors",
494
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wq.bias": "model-00003-of-00004.safetensors",
495
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wq.weight": "model-00003-of-00004.safetensors",
496
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wv.bias": "model-00003-of-00004.safetensors",
497
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wv.weight": "model-00003-of-00004.safetensors",
498
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention_norm.bias": "model-00003-of-00004.safetensors",
499
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention_norm.weight": "model-00003-of-00004.safetensors",
500
+ "model.vision_backbone.image_vit.transformer.resblocks.20.feed_forward.w1.bias": "model-00003-of-00004.safetensors",
501
+ "model.vision_backbone.image_vit.transformer.resblocks.20.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
502
+ "model.vision_backbone.image_vit.transformer.resblocks.20.feed_forward.w2.bias": "model-00003-of-00004.safetensors",
503
+ "model.vision_backbone.image_vit.transformer.resblocks.20.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
504
+ "model.vision_backbone.image_vit.transformer.resblocks.20.ffn_norm.bias": "model-00003-of-00004.safetensors",
505
+ "model.vision_backbone.image_vit.transformer.resblocks.20.ffn_norm.weight": "model-00003-of-00004.safetensors",
506
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wk.bias": "model-00003-of-00004.safetensors",
507
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wk.weight": "model-00003-of-00004.safetensors",
508
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wo.bias": "model-00003-of-00004.safetensors",
509
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wo.weight": "model-00003-of-00004.safetensors",
510
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wq.bias": "model-00003-of-00004.safetensors",
511
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wq.weight": "model-00003-of-00004.safetensors",
512
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wv.bias": "model-00003-of-00004.safetensors",
513
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wv.weight": "model-00003-of-00004.safetensors",
514
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention_norm.bias": "model-00003-of-00004.safetensors",
515
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention_norm.weight": "model-00003-of-00004.safetensors",
516
+ "model.vision_backbone.image_vit.transformer.resblocks.21.feed_forward.w1.bias": "model-00003-of-00004.safetensors",
517
+ "model.vision_backbone.image_vit.transformer.resblocks.21.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
518
+ "model.vision_backbone.image_vit.transformer.resblocks.21.feed_forward.w2.bias": "model-00003-of-00004.safetensors",
519
+ "model.vision_backbone.image_vit.transformer.resblocks.21.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
520
+ "model.vision_backbone.image_vit.transformer.resblocks.21.ffn_norm.bias": "model-00003-of-00004.safetensors",
521
+ "model.vision_backbone.image_vit.transformer.resblocks.21.ffn_norm.weight": "model-00003-of-00004.safetensors",
522
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wk.bias": "model-00003-of-00004.safetensors",
523
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wk.weight": "model-00003-of-00004.safetensors",
524
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wo.bias": "model-00003-of-00004.safetensors",
525
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wo.weight": "model-00003-of-00004.safetensors",
526
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wq.bias": "model-00003-of-00004.safetensors",
527
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wq.weight": "model-00003-of-00004.safetensors",
528
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wv.bias": "model-00003-of-00004.safetensors",
529
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wv.weight": "model-00003-of-00004.safetensors",
530
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention_norm.bias": "model-00003-of-00004.safetensors",
531
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention_norm.weight": "model-00003-of-00004.safetensors",
532
+ "model.vision_backbone.image_vit.transformer.resblocks.3.feed_forward.w1.bias": "model-00003-of-00004.safetensors",
533
+ "model.vision_backbone.image_vit.transformer.resblocks.3.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
534
+ "model.vision_backbone.image_vit.transformer.resblocks.3.feed_forward.w2.bias": "model-00003-of-00004.safetensors",
535
+ "model.vision_backbone.image_vit.transformer.resblocks.3.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
536
+ "model.vision_backbone.image_vit.transformer.resblocks.3.ffn_norm.bias": "model-00003-of-00004.safetensors",
537
+ "model.vision_backbone.image_vit.transformer.resblocks.3.ffn_norm.weight": "model-00003-of-00004.safetensors",
538
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wk.bias": "model-00003-of-00004.safetensors",
539
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wk.weight": "model-00003-of-00004.safetensors",
540
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wo.bias": "model-00003-of-00004.safetensors",
541
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wo.weight": "model-00003-of-00004.safetensors",
542
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wq.bias": "model-00003-of-00004.safetensors",
543
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wq.weight": "model-00003-of-00004.safetensors",
544
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wv.bias": "model-00003-of-00004.safetensors",
545
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wv.weight": "model-00003-of-00004.safetensors",
546
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention_norm.bias": "model-00003-of-00004.safetensors",
547
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention_norm.weight": "model-00003-of-00004.safetensors",
548
+ "model.vision_backbone.image_vit.transformer.resblocks.4.feed_forward.w1.bias": "model-00003-of-00004.safetensors",
549
+ "model.vision_backbone.image_vit.transformer.resblocks.4.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
550
+ "model.vision_backbone.image_vit.transformer.resblocks.4.feed_forward.w2.bias": "model-00003-of-00004.safetensors",
551
+ "model.vision_backbone.image_vit.transformer.resblocks.4.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
552
+ "model.vision_backbone.image_vit.transformer.resblocks.4.ffn_norm.bias": "model-00003-of-00004.safetensors",
553
+ "model.vision_backbone.image_vit.transformer.resblocks.4.ffn_norm.weight": "model-00003-of-00004.safetensors",
554
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wk.bias": "model-00003-of-00004.safetensors",
555
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wk.weight": "model-00003-of-00004.safetensors",
556
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wo.bias": "model-00003-of-00004.safetensors",
557
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wo.weight": "model-00003-of-00004.safetensors",
558
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wq.bias": "model-00003-of-00004.safetensors",
559
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wq.weight": "model-00003-of-00004.safetensors",
560
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wv.bias": "model-00003-of-00004.safetensors",
561
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wv.weight": "model-00003-of-00004.safetensors",
562
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention_norm.bias": "model-00003-of-00004.safetensors",
563
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention_norm.weight": "model-00003-of-00004.safetensors",
564
+ "model.vision_backbone.image_vit.transformer.resblocks.5.feed_forward.w1.bias": "model-00003-of-00004.safetensors",
565
+ "model.vision_backbone.image_vit.transformer.resblocks.5.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
566
+ "model.vision_backbone.image_vit.transformer.resblocks.5.feed_forward.w2.bias": "model-00003-of-00004.safetensors",
567
+ "model.vision_backbone.image_vit.transformer.resblocks.5.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
568
+ "model.vision_backbone.image_vit.transformer.resblocks.5.ffn_norm.bias": "model-00003-of-00004.safetensors",
569
+ "model.vision_backbone.image_vit.transformer.resblocks.5.ffn_norm.weight": "model-00003-of-00004.safetensors",
570
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wk.bias": "model-00003-of-00004.safetensors",
571
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wk.weight": "model-00003-of-00004.safetensors",
572
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wo.bias": "model-00003-of-00004.safetensors",
573
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wo.weight": "model-00003-of-00004.safetensors",
574
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wq.bias": "model-00003-of-00004.safetensors",
575
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wq.weight": "model-00003-of-00004.safetensors",
576
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wv.bias": "model-00003-of-00004.safetensors",
577
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wv.weight": "model-00003-of-00004.safetensors",
578
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention_norm.bias": "model-00003-of-00004.safetensors",
579
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention_norm.weight": "model-00003-of-00004.safetensors",
580
+ "model.vision_backbone.image_vit.transformer.resblocks.6.feed_forward.w1.bias": "model-00003-of-00004.safetensors",
581
+ "model.vision_backbone.image_vit.transformer.resblocks.6.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
582
+ "model.vision_backbone.image_vit.transformer.resblocks.6.feed_forward.w2.bias": "model-00003-of-00004.safetensors",
583
+ "model.vision_backbone.image_vit.transformer.resblocks.6.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
584
+ "model.vision_backbone.image_vit.transformer.resblocks.6.ffn_norm.bias": "model-00003-of-00004.safetensors",
585
+ "model.vision_backbone.image_vit.transformer.resblocks.6.ffn_norm.weight": "model-00003-of-00004.safetensors",
586
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wk.bias": "model-00003-of-00004.safetensors",
587
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wk.weight": "model-00003-of-00004.safetensors",
588
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wo.bias": "model-00003-of-00004.safetensors",
589
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wo.weight": "model-00003-of-00004.safetensors",
590
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wq.bias": "model-00003-of-00004.safetensors",
591
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wq.weight": "model-00003-of-00004.safetensors",
592
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wv.bias": "model-00003-of-00004.safetensors",
593
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wv.weight": "model-00003-of-00004.safetensors",
594
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention_norm.bias": "model-00003-of-00004.safetensors",
595
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention_norm.weight": "model-00003-of-00004.safetensors",
596
+ "model.vision_backbone.image_vit.transformer.resblocks.7.feed_forward.w1.bias": "model-00003-of-00004.safetensors",
597
+ "model.vision_backbone.image_vit.transformer.resblocks.7.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
598
+ "model.vision_backbone.image_vit.transformer.resblocks.7.feed_forward.w2.bias": "model-00003-of-00004.safetensors",
599
+ "model.vision_backbone.image_vit.transformer.resblocks.7.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
600
+ "model.vision_backbone.image_vit.transformer.resblocks.7.ffn_norm.bias": "model-00003-of-00004.safetensors",
601
+ "model.vision_backbone.image_vit.transformer.resblocks.7.ffn_norm.weight": "model-00003-of-00004.safetensors",
602
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wk.bias": "model-00003-of-00004.safetensors",
603
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wk.weight": "model-00003-of-00004.safetensors",
604
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wo.bias": "model-00003-of-00004.safetensors",
605
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wo.weight": "model-00003-of-00004.safetensors",
606
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wq.bias": "model-00003-of-00004.safetensors",
607
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wq.weight": "model-00003-of-00004.safetensors",
608
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wv.bias": "model-00003-of-00004.safetensors",
609
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wv.weight": "model-00003-of-00004.safetensors",
610
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention_norm.bias": "model-00003-of-00004.safetensors",
611
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention_norm.weight": "model-00003-of-00004.safetensors",
612
+ "model.vision_backbone.image_vit.transformer.resblocks.8.feed_forward.w1.bias": "model-00003-of-00004.safetensors",
613
+ "model.vision_backbone.image_vit.transformer.resblocks.8.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
614
+ "model.vision_backbone.image_vit.transformer.resblocks.8.feed_forward.w2.bias": "model-00003-of-00004.safetensors",
615
+ "model.vision_backbone.image_vit.transformer.resblocks.8.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
616
+ "model.vision_backbone.image_vit.transformer.resblocks.8.ffn_norm.bias": "model-00003-of-00004.safetensors",
617
+ "model.vision_backbone.image_vit.transformer.resblocks.8.ffn_norm.weight": "model-00003-of-00004.safetensors",
618
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wk.bias": "model-00003-of-00004.safetensors",
619
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wk.weight": "model-00003-of-00004.safetensors",
620
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wo.bias": "model-00003-of-00004.safetensors",
621
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wo.weight": "model-00003-of-00004.safetensors",
622
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wq.bias": "model-00003-of-00004.safetensors",
623
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wq.weight": "model-00003-of-00004.safetensors",
624
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wv.bias": "model-00003-of-00004.safetensors",
625
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wv.weight": "model-00003-of-00004.safetensors",
626
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention_norm.bias": "model-00003-of-00004.safetensors",
627
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention_norm.weight": "model-00003-of-00004.safetensors",
628
+ "model.vision_backbone.image_vit.transformer.resblocks.9.feed_forward.w1.bias": "model-00003-of-00004.safetensors",
629
+ "model.vision_backbone.image_vit.transformer.resblocks.9.feed_forward.w1.weight": "model-00003-of-00004.safetensors",
630
+ "model.vision_backbone.image_vit.transformer.resblocks.9.feed_forward.w2.bias": "model-00003-of-00004.safetensors",
631
+ "model.vision_backbone.image_vit.transformer.resblocks.9.feed_forward.w2.weight": "model-00003-of-00004.safetensors",
632
+ "model.vision_backbone.image_vit.transformer.resblocks.9.ffn_norm.bias": "model-00003-of-00004.safetensors",
633
+ "model.vision_backbone.image_vit.transformer.resblocks.9.ffn_norm.weight": "model-00003-of-00004.safetensors",
634
+ "model.vision_backbone.pad_embed": "model-00003-of-00004.safetensors"
635
+ }
636
+ }
modeling_molmoact.py ADDED
@@ -0,0 +1,2100 @@
1
+ import math
2
+ from copy import deepcopy
3
+ from dataclasses import dataclass
4
+ from typing import List, Optional, Tuple, Union, Dict, Any, Sequence, Callable
5
+
6
+ import torch
7
+ from torch import nn
8
+ from torch.nn import functional as F
9
+
10
+ from transformers.models.auto import AutoModelForCausalLM, AutoModelForImageTextToText
11
+ from transformers.activations import ACT2FN
12
+ from transformers.cache_utils import Cache, DynamicCache
13
+ from transformers.generation import GenerationMixin
14
+ from transformers.generation.configuration_utils import GenerationConfig
15
+ from transformers.generation.utils import GenerateOutput
16
+ from transformers.integrations import use_kernel_forward_from_hub
17
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
18
+ from transformers.modeling_flash_attention_utils import _flash_attention_forward, FlashAttentionKwargs
19
+ from transformers import GradientCheckpointingLayer
20
+ from transformers.modeling_outputs import (
21
+ BaseModelOutput,
22
+ BaseModelOutputWithPast,
23
+ BaseModelOutputWithPooling,
24
+ CausalLMOutputWithPast,
25
+ )
26
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
27
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
28
+ from transformers.processing_utils import Unpack
29
+ from transformers.utils import (
30
+ ModelOutput,
31
+ can_return_tuple,
32
+ is_torch_flex_attn_available,
33
+ logging,
34
+ add_start_docstrings,
35
+ add_start_docstrings_to_model_forward,
36
+ )
37
+
38
+ from .configuration_molmoact import MolmoActConfig, MolmoActVitConfig, MolmoActAdapterConfig, MolmoActLlmConfig
39
+
40
+ import re
41
+ import numpy as np
42
+ from transformers import Qwen2Tokenizer
43
+
44
+
45
+ if is_torch_flex_attn_available():
46
+ from torch.nn.attention.flex_attention import BlockMask
47
+
48
+ from transformers.integrations.flex_attention import make_flex_block_causal_mask
49
+
50
+
51
+ logger = logging.get_logger(__name__)
52
+
53
+
54
+ MOLMO_START_DOCSTRING = r"""
55
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
56
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
57
+ etc.)
58
+
59
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
60
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
61
+ and behavior.
62
+
63
+ Parameters:
64
+ config ([`MolmoActConfig`]):
65
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
66
+ load the weights associated with the model, only the configuration. Check out the
67
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
68
+ """
69
+
70
+
71
+ NUM_RE = re.compile(r'[+-]?(?:\d+(?:\.\d+)?|\.\d+)(?:[eE][+-]?\d+)?$')
72
+ DEPTH_RE = re.compile(r'<DEPTH_START>(.*?)<DEPTH_END>', re.DOTALL)
73
+ # One-level-nested [...] matcher: outer block that may contain inner [ ... ] lists
74
+ OUTER_BLOCK_RE = re.compile(r'\[(?:[^\[\]]|\[[^\[\]]*\])+\]')
75
+
76
+ def _is_number(s: str) -> bool:
77
+ return bool(NUM_RE.match(s))
78
+
79
+ def _has_non_ascii(s: str) -> bool:
80
+ return any(ord(ch) > 127 for ch in s)
81
+
82
+ def _to_number(s: str):
83
+ """Parse string number to int when possible, else float."""
84
+ v = float(s)
85
+ return int(v) if v.is_integer() else v
86
+
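+ # Illustrative examples (editorial addition, not part of the uploaded file): the helpers
+ # above accept plain, signed, decimal, and exponent-style numerals as whole strings.
+ #     _is_number("140")     -> True      _to_number("140")     -> 140 (int)
+ #     _is_number("1.5e-3")  -> True      _to_number("1.5e-3")  -> 0.0015 (float)
+ #     _is_number("x1")      -> False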
87
+ def extract_depth_string(text: str, include_tags: bool = False) -> list[str]:
88
+ """
89
+ Return all occurrences of depth strings.
90
+ If include_tags=True, each item is '<DEPTH_START>...<DEPTH_END>';
91
+ otherwise each item is just the inner '...'.
92
+ """
93
+ matches = list(DEPTH_RE.finditer(text))
94
+ if include_tags:
95
+ return [m.group(0) for m in matches]
96
+ return [m.group(1) for m in matches]
97
+
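+ # Illustrative usage (editorial addition; the sample strings are hypothetical):
+ #     >>> extract_depth_string("ok <DEPTH_START>d0 d1 d2<DEPTH_END> done")
+ #     ['d0 d1 d2']
+ #     >>> extract_depth_string("<DEPTH_START>d0<DEPTH_END>", include_tags=True)
+ #     ['<DEPTH_START>d0<DEPTH_END>']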
98
+ def extract_trace_lists(
99
+ text: str,
100
+ point_len: int | None = 2, # e.g., 2 for [x,y], 3 for [x,y,z]; None = any length ≥1
101
+ min_points: int = 1
102
+ ) -> list[list[list[float]]]:
103
+ """
104
+ Extract *numeric* lists-of-lists like [[140,225],[130,212],...].
105
+ Returns a list of traces; each trace is a list of points (lists of numbers).
106
+
107
+ Heuristic:
108
+ - Find outer [ ... ] blocks that may contain inner lists
109
+ - Keep blocks where every inner list is fully numeric
110
+ - Enforce per-point length (point_len) and a minimum number of points (min_points)
111
+ """
112
+ traces: list[list[list[float]]] = []
113
+
114
+ # Find outer blocks that can contain nested lists
115
+ for block in OUTER_BLOCK_RE.findall(text):
116
+ inner_strs = re.findall(r'\[([^\[\]]+)\]', block) # contents of each inner [...]
117
+ if len(inner_strs) < min_points:
118
+ continue
119
+
120
+ rows: list[list[float]] = []
121
+ ok = True
122
+ for row in inner_strs:
123
+ parts = [p.strip().strip('"').strip("'") for p in row.split(',')]
124
+ if point_len is not None and len(parts) != point_len:
125
+ ok = False
126
+ break
127
+ if not all(_is_number(p) for p in parts):
128
+ ok = False
129
+ break
130
+ rows.append([_to_number(p) for p in parts])
131
+
132
+ if ok:
133
+ traces.append(rows)
134
+
135
+ return traces
136
+
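+ # Illustrative usage (editorial addition; the sample string is hypothetical):
+ #     >>> extract_trace_lists("The trace is [[140, 225], [130, 212]].")
+ #     [[[140, 225], [130, 212]]]
+ # A flat numeric list such as "[140, 225]" is returned as a one-point trace, while a block
+ # containing any non-numeric or wrong-length inner list is skipped entirely.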
137
+ def extract_action_token_lists(
138
+ text: str,
139
+ only_len: int | None = None, # e.g., 7 if you expect 7-D actions
140
+ require_non_ascii: bool = True # set False if your tokens can be pure ASCII
141
+ ) -> list[list[str]]:
142
+ """
143
+ Extract all [ ... ] groups split by commas, discard numeric lists,
144
+ and return token lists (quotes stripped, whitespace trimmed).
145
+ """
146
+ lists = []
147
+ # Match NON-nested bracketed groups: [ ... ] without inner [ or ]
148
+ for inner in re.findall(r'\[([^\[\]]+)\]', text):
149
+ parts = [p.strip().strip('"').strip("'") for p in inner.split(',')]
150
+
151
+ if only_len is not None and len(parts) != only_len:
152
+ continue
153
+
154
+ # If *all* items are numeric -> not action tokens (like coordinates)
155
+ if all(_is_number(p) for p in parts):
156
+ continue
157
+
158
+ # Optionally require at least one non-ASCII char across tokens (helps exclude plain words/numbers)
159
+ if require_non_ascii and not any(_has_non_ascii(p) for p in parts):
160
+ continue
161
+
162
+ lists.append(parts)
163
+
164
+ return lists
165
+
166
+
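+ # Illustrative usage (editorial addition; the token strings are hypothetical):
+ #     >>> extract_action_token_lists('The action is ["▁0.1", "▁-0.2", "▁close"]', only_len=3)
+ #     [['▁0.1', '▁-0.2', '▁close']]
+ # Purely numeric groups such as "[140, 225]" are skipped, and with the default
+ # require_non_ascii=True a group must contain at least one non-ASCII character to be kept.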
167
+ @dataclass
168
+ class MolmoActCausalLMOutputWithPast(ModelOutput):
169
+ """
170
+ Base class for MolmoAct causal language model (or autoregressive) outputs.
171
+
172
+ Args:
173
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
174
+ Language modeling loss (for next-token prediction).
175
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
176
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
177
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
178
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
179
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`
180
+
181
+ Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
182
+ `past_key_values` input) to speed up sequential decoding.
183
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
184
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
185
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
186
+
187
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
188
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
189
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
190
+ sequence_length)`.
191
+
192
+ Attention weights after the attention softmax, used to compute the weighted average in the self-attention
193
+ heads.
194
+ image_hidden_states (`torch.FloatTensor`, *optional*):
195
+ A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
196
+ image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
197
+ """
198
+
199
+ loss: Optional[torch.FloatTensor] = None
200
+ logits: Optional[torch.FloatTensor] = None
201
+ past_key_values: Optional[List[torch.FloatTensor]] = None
202
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
203
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
204
+ image_hidden_states: Optional[torch.FloatTensor] = None
205
+
206
+
207
+ @dataclass
208
+ class MolmoActModelOutputWithPast(BaseModelOutputWithPast):
209
+ """
210
+ Base class for MolmoAct outputs, with hidden states and attentions.
211
+
212
+ Args:
213
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
214
+ Sequence of hidden-states at the output of the last layer of the model.
215
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
216
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
217
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`
218
+
219
+ Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
220
+ `past_key_values` input) to speed up sequential decoding.
221
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
222
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
223
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
224
+
225
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
226
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
227
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
228
+ sequence_length)`.
229
+
230
+ Attention weights after the attention softmax, used to compute the weighted average in the self-attention
231
+ heads.
232
+ image_hidden_states (`torch.FloatTensor`, *optional*):
233
+ A `torch.FloatTensor` of size `(batch_num_patches, hidden_size)`.
234
+ image_hidden_states of the model produced by the vision backbone.
235
+ """
236
+
237
+ image_hidden_states: Optional[torch.FloatTensor] = None
238
+ logits: Optional[torch.FloatTensor] = None
239
+
240
+
241
+ class MolmoActPreTrainedModel(PreTrainedModel):
242
+ config_class = MolmoActLlmConfig
243
+ base_model_prefix = "model"
244
+ supports_gradient_checkpointing = True
245
+ _no_split_modules = ["MolmoActDecoderLayer", "MolmoActPostNormDecoderLayer"]
246
+ _skip_keys_device_placement = ["past_key_values"]
247
+ _supports_flash_attn_2 = True
248
+ _supports_sdpa = True
249
+ _supports_flex_attn = False
250
+ _supports_cache_class = True
251
+ _supports_quantized_cache = True
252
+ _supports_static_cache = True
253
+ _supports_attention_backend = True
254
+
255
+ def _init_weights(self, module):
256
+ std = self.config.initializer_range
257
+ if isinstance(module, (nn.Linear,)):
258
+ module.weight.data.normal_(mean=0.0, std=std)
259
+ if module.bias is not None:
260
+ module.bias.data.zero_()
261
+ elif isinstance(module, MolmoActEmbedding):
262
+ module.embedding.data.normal_(mean=0.0, std=std)
263
+ module.new_embedding.data.normal_(mean=0.0, std=std)
264
+ elif isinstance(module, nn.Embedding):
265
+ module.weight.data.normal_(mean=0.0, std=std)
266
+ if module.padding_idx is not None:
267
+ module.weight.data[module.padding_idx].zero_()
268
+ elif isinstance(module, MolmoActRMSNorm):
269
+ module.weight.data.fill_(1.0)
270
+ elif isinstance(module, nn.LayerNorm):
271
+ module.weight.data.fill_(1.0)
272
+ if module.bias is not None:
273
+ module.bias.data.zero_()
274
+
275
+
276
+ class ViTMLP(nn.Module):
277
+ def __init__(self, dim: int, hidden_dim: int, hidden_act: str, device: Union[str, torch.device] = None):
278
+ super().__init__()
279
+ self.w1 = nn.Linear(dim, hidden_dim, bias=True, device=device)
280
+ self.act = ACT2FN[hidden_act]
281
+ self.w2 = nn.Linear(hidden_dim, dim, bias=True, device=device)
282
+
283
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
284
+ return self.w2(self.act(self.w1(x)))
285
+
286
+
287
+ class ViTMultiHeadDotProductAttention(nn.Module):
288
+ def __init__(
289
+ self,
290
+ hidden_size: int,
291
+ num_heads: int,
292
+ num_key_value_heads: int,
293
+ head_dim: int,
294
+ use_bias: bool = True,
295
+ input_dim: Optional[int] = None,
296
+ float32_attention: bool = True,
297
+ attention_dropout: float = 0.0,
298
+ residual_dropout: float = 0.0,
299
+ device: Union[str, torch.device] = None,
300
+ attn_implementation: str = "eager",
301
+ ):
302
+ super().__init__()
303
+
304
+ self.hidden_size = hidden_size
305
+ self.num_heads = num_heads
306
+ self.head_dim = head_dim
307
+ self.num_key_value_heads = num_key_value_heads
308
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
309
+ self.attn_implementation = attn_implementation
310
+ self.is_causal = False
311
+
312
+ input_dim = input_dim or hidden_size
313
+
314
+ self.wq = nn.Linear(
315
+ input_dim,
316
+ self.num_heads * self.head_dim,
317
+ bias=use_bias,
318
+ device=device,
319
+ )
320
+ self.wk = nn.Linear(
321
+ input_dim,
322
+ self.num_key_value_heads * self.head_dim,
323
+ bias=use_bias,
324
+ device=device,
325
+ )
326
+ self.wv = nn.Linear(
327
+ input_dim,
328
+ self.num_key_value_heads * self.head_dim,
329
+ bias=use_bias,
330
+ device=device,
331
+ )
332
+ self.wo = nn.Linear(
333
+ self.num_heads * self.head_dim,
334
+ self.hidden_size,
335
+ )
336
+ self.float32_attention = float32_attention
337
+ self.attention_dropout = attention_dropout
338
+ self.residual_dropout = nn.Dropout(residual_dropout)
339
+
340
+ def _split_heads(self, hidden_states, num_heads) -> torch.Tensor:
341
+ return hidden_states.reshape(hidden_states.shape[:2] + (num_heads, self.head_dim))
342
+
343
+ def _merge_heads(self, hidden_states) -> torch.Tensor:
344
+ return hidden_states.reshape(hidden_states.shape[:2] + (self.hidden_size,))
345
+
346
+ def forward(
347
+ self,
348
+ inputs_q: torch.Tensor,
349
+ inputs_kv: Optional[torch.Tensor] = None,
350
+ attn_mask: Optional[torch.Tensor] = None,
351
+ ) -> torch.Tensor:
352
+
353
+ if inputs_kv is not None:
354
+ inputs_k = inputs_kv
355
+ inputs_v = inputs_kv
356
+ else:
357
+ inputs_k = inputs_q
358
+ inputs_v = inputs_q
359
+
360
+ xq, xk, xv = self.wq(inputs_q), self.wk(inputs_k), self.wv(inputs_v)
361
+
362
+ xq = self._split_heads(xq, self.num_heads)
363
+ xk = self._split_heads(xk, self.num_key_value_heads)
364
+ xv = self._split_heads(xv, self.num_key_value_heads)
365
+
366
+ if self.num_heads != self.num_key_value_heads:
367
+ xk = xk.repeat_interleave(self.num_key_value_groups, dim=2, output_size=self.num_heads)
368
+ xv = xv.repeat_interleave(self.num_key_value_groups, dim=2, output_size=self.num_heads)
369
+
370
+ og_dtype = xq.dtype
371
+
372
+ if self.float32_attention:
373
+ xq = xq.to(torch.float)
374
+ xk = xk.to(torch.float)
375
+
376
+ dropout_p = 0.0 if not self.training else self.attention_dropout
377
+
378
+ if self.attn_implementation == "eager":
379
+ attn_weights = torch.einsum("...qhd,...khd->...hqk", xq / math.sqrt(xq.size(-1)), xk)
380
+ attn_weights = F.softmax(attn_weights, dim=-1)
381
+ attn_weights = F.dropout(
382
+ attn_weights,
383
+ p=dropout_p,
384
+ training=self.training
385
+ )
386
+ attn_output = torch.einsum("...hqk,...khd->...qhd", attn_weights.to(xv.dtype), xv)
387
+
388
+ elif self.attn_implementation == "sdpa":
389
+ if not torch.is_autocast_enabled():
390
+ xv = xv.to(torch.float)
391
+
392
+ attn_output = F.scaled_dot_product_attention(
393
+ xq.transpose(1, 2).contiguous(),
394
+ xk.transpose(1, 2).contiguous(),
395
+ xv.transpose(1, 2).contiguous(),
396
+ attn_mask=attn_mask,
397
+ is_causal=False,
398
+ dropout_p=dropout_p,
399
+ ).transpose(1, 2)
400
+
401
+ elif self.attn_implementation == "flash_attention_2":
402
+ assert not self.float32_attention
403
+ # Downcast in case we are running with fp32 hidden states
404
+ attn_output = _flash_attention_forward(
405
+ xq.transpose(1, 2).to(torch.bfloat16),
406
+ xk.transpose(1, 2).to(torch.bfloat16),
407
+ xv.transpose(1, 2).to(torch.bfloat16),
408
+ attention_mask=None,
409
+ query_length=inputs_q.shape[1],
410
+ is_causal=False,
411
+ dropout=dropout_p,
412
+ )
413
+ else:
414
+ raise ValueError(f"Attention implementation {self.attn_implementation} not supported")
415
+
416
+ attn_output = attn_output.to(og_dtype)
417
+ attn_output = self._merge_heads(attn_output)
418
+ attn_output = self.wo(attn_output)
419
+ attn_output = self.residual_dropout(attn_output)
420
+
421
+ return attn_output
422
+
423
+
424
+ class MolmoActVisionBlock(nn.Module):
425
+
426
+ def __init__(self, config: MolmoActVitConfig, device: Union[str, torch.device] = None):
427
+ super().__init__()
428
+ self.attention = ViTMultiHeadDotProductAttention(
429
+ hidden_size=config.hidden_size,
430
+ num_heads=config.num_attention_heads,
431
+ num_key_value_heads=config.num_key_value_heads,
432
+ head_dim=config.head_dim,
433
+ float32_attention=config.float32_attention,
434
+ attention_dropout=config.attention_dropout,
435
+ residual_dropout=config.residual_dropout,
436
+ device=device,
437
+ attn_implementation=config._attn_implementation,
438
+ )
439
+ self.feed_forward = ViTMLP(config.hidden_size, config.intermediate_size, config.hidden_act, device=device)
440
+ self.attention_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, device=device)
441
+ self.ffn_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, device=device)
442
+
443
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
444
+ x = x + self.attention(self.attention_norm(x))
445
+ x = x + self.feed_forward(self.ffn_norm(x))
446
+ return x
447
+
448
+
449
+ class MolmoActVisionBlockCollection(nn.Module):
450
+
451
+ def __init__(self, config: MolmoActVitConfig, device: Union[str, torch.device] = None):
452
+ super().__init__()
453
+ self.config = config
454
+ self.resblocks = nn.ModuleList([
455
+ MolmoActVisionBlock(config, device) for _ in range(config.num_hidden_layers)
456
+ ])
457
+
458
+ def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
459
+ hidden_states = []
460
+ for r in self.resblocks:
461
+ x = r(x)
462
+ hidden_states.append(x)
463
+ return hidden_states
464
+
465
+
466
+ def _expand_token(token, batch_size: int):
467
+ return token.view(1, 1, -1).expand(batch_size, -1, -1)
468
+
469
+
470
+ class MolmoActVisionTransformer(nn.Module):
471
+
472
+ def __init__(self, config: MolmoActVitConfig, device: Union[str, torch.device] = None):
473
+ super().__init__()
474
+ self.config = config
475
+
476
+ self.scale = config.hidden_size ** -0.5
477
+
478
+ # optional CLS
479
+ self.num_prefix_tokens: int = 1 if config.use_cls_token else 0
480
+ if config.use_cls_token:
481
+ self.class_embedding = nn.Parameter(
482
+ torch.zeros(config.hidden_size, device=device)
483
+ )
484
+
485
+ # positional embeddings
486
+ self.positional_embedding = nn.Parameter(
487
+ torch.zeros(config.image_num_pos, config.hidden_size, device=device),
488
+ )
489
+
490
+ image_patch_size = config.image_patch_size
491
+ self.patch_embedding = nn.Linear(
492
+ image_patch_size * image_patch_size * 3,
493
+ config.hidden_size,
494
+ bias=config.patch_bias,
495
+ device=device,
496
+ )
497
+
498
+ # optional pre-LN
499
+ self.pre_ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, device=device) \
500
+ if config.pre_layernorm else None
501
+
502
+ self.transformer = MolmoActVisionBlockCollection(config, device)
503
+
504
+ def add_pos_emb(self, x: torch.Tensor, patch_num: Tuple[int, int]) -> torch.Tensor:
505
+ pos_emb = self.positional_embedding
506
+ if self.config.use_cls_token:
507
+ cls_pos, pos_emb = pos_emb[:1], pos_emb[1:] # split out CLS
508
+
509
+ pos_emb = pos_emb.reshape(
510
+ (int(math.sqrt(pos_emb.shape[0])), int(math.sqrt(pos_emb.shape[0])), pos_emb.shape[1])
511
+ )
512
+
513
+ (patch_num_0, patch_num_1) = patch_num
514
+
515
+ if pos_emb.shape[0] != patch_num_0 or pos_emb.shape[1] != patch_num_1:
516
+ # Derived from https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
517
+ # antialias: default True in jax.image.resize
518
+ pos_emb = pos_emb.unsqueeze(0).permute(0, 3, 1, 2)
519
+ pos_emb = F.interpolate(
520
+ pos_emb, size=(patch_num_0, patch_num_1), mode="bicubic", align_corners=False, antialias=True,
521
+ )
522
+ pos_emb = pos_emb.permute(0, 2, 3, 1).squeeze(0)
523
+
524
+ pos_emb = pos_emb.reshape(-1, pos_emb.shape[-1])
525
+
526
+ if self.config.use_cls_token:
527
+ x = x + torch.cat([cls_pos[None, :, :], pos_emb[None, :, :]], dim=1).to(x.dtype)
528
+ else:
529
+ x = x + pos_emb[None, :, :].to(x.dtype)
530
+
531
+ return x
532
+
533
+ def forward(self, x: torch.Tensor, patch_num: Optional[Tuple[int, int]] = None) -> List[torch.Tensor]:
534
+ """
535
+ :param x: (batch_size, num_patch, n_pixels)
536
+ """
537
+ if patch_num is None:
538
+ patch_num = self.config.image_num_patch
539
+
540
+ B, N, D = x.shape
541
+
542
+ x = self.patch_embedding(x)
543
+
544
+ if self.config.use_cls_token:
545
+ x = torch.cat([_expand_token(self.class_embedding, x.size(0)).to(x.dtype), x], dim=1)
546
+
547
+ # class embeddings and positional embeddings
548
+ x = self.add_pos_emb(x, patch_num)
549
+
550
+ if self.pre_ln is not None:
551
+ x = self.pre_ln(x)
552
+
553
+ hidden_states = self.transformer(x)
554
+ return hidden_states
555
+
556
+
557
+ class ImageProjectorMLP(nn.Module):
558
+
559
+ def __init__(
560
+ self,
561
+ input_dim: int,
562
+ hidden_dim: int,
563
+ output_dim: int,
564
+ hidden_act: str,
565
+ device: Union[str, torch.device] = None,
566
+ ):
567
+ super().__init__()
568
+ self.w1 = nn.Linear(input_dim, hidden_dim, bias=False, device=device)
569
+ self.w2 = nn.Linear(hidden_dim, output_dim, bias=False, device=device)
570
+ self.w3 = nn.Linear(input_dim, hidden_dim, bias=False, device=device)
571
+ self.act = ACT2FN[hidden_act]
572
+
573
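+ # Gated MLP (SwiGLU-style): returns w2(act(w1(x)) * w3(x)).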
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
574
+ return self.w2(self.act(self.w1(x)) * self.w3(x))
575
+
576
+
577
+ class MolmoActVisionBackbone(nn.Module):
578
+ def __init__(self, vit_config: MolmoActVitConfig, adapter_config: MolmoActAdapterConfig):
579
+ super().__init__()
580
+ self.vit_config = vit_config
581
+ self.adapter_config = adapter_config
582
+
583
+ self.vit_layers = []
584
+ for layer in adapter_config.vit_layers:
585
+ if layer >= 0:
586
+ self.vit_layers.append(layer)
587
+ else:
588
+ self.vit_layers.append(layer + vit_config.num_hidden_layers)
589
+
590
+ last_layer_needed = max(self.vit_layers) + 1
591
+ if last_layer_needed < vit_config.num_hidden_layers:
592
+ new_vit_config = deepcopy(vit_config)
593
+ new_vit_config.num_hidden_layers = last_layer_needed
594
+ self.image_vit = MolmoActVisionTransformer(new_vit_config)
595
+ else:
596
+ self.image_vit = MolmoActVisionTransformer(vit_config)
597
+
598
+ self.num_prefix_tokens: int = self.image_vit.num_prefix_tokens
599
+
600
+ # optional pad_embed
601
+ self.pad_embed = None
602
+ if adapter_config.image_padding_embed == "pad_and_partial_pad":
603
+ pool_dim = vit_config.hidden_size * len(adapter_config.vit_layers)
604
+ self.pad_embed = nn.Parameter(torch.zeros((2, pool_dim)))
605
+
606
+ pool_dim = vit_config.hidden_size * len(adapter_config.vit_layers)
607
+ self.image_pooling_2d = ViTMultiHeadDotProductAttention(
608
+ hidden_size=adapter_config.hidden_size,
609
+ num_heads=adapter_config.num_attention_heads,
610
+ num_key_value_heads=adapter_config.num_key_value_heads,
611
+ head_dim=adapter_config.head_dim,
612
+ input_dim=pool_dim,
613
+ float32_attention=adapter_config.float32_attention,
614
+ attention_dropout=adapter_config.attention_dropout,
615
+ residual_dropout=adapter_config.residual_dropout,
616
+ attn_implementation=adapter_config._attn_implementation,
617
+ )
618
+ self.image_projector = ImageProjectorMLP(
619
+ adapter_config.hidden_size,
620
+ adapter_config.intermediate_size,
621
+ adapter_config.text_hidden_size,
622
+ adapter_config.hidden_act,
623
+ )
624
+ self.image_feature_dropout = nn.Dropout(adapter_config.image_feature_dropout)
625
+
626
+ def encode_image(self, images: torch.Tensor) -> torch.Tensor:
627
+ """
628
+ :param images: (batch_size, num_crops, num_patch, n_pixels)
629
+ """
630
+ B, T, N, D = images.shape
631
+ images = images.view(B * T, N, D)
632
+ image_features = self.image_vit(images)
633
+
634
+ features = []
635
+ for layer in self.vit_layers:
636
+ features.append(image_features[layer])
637
+ image_features = torch.cat(features, dim=-1)
638
+
639
+ if self.num_prefix_tokens > 0:
640
+ image_features = image_features[:, 1:]
641
+ image_features = image_features.view(B, T, N, -1)
642
+ return image_features
643
+
644
+ @property
645
+ def dtype(self) -> torch.dtype:
646
+ return self.image_vit.patch_embedding.weight.dtype
647
+
648
+ @property
649
+ def device(self) -> torch.device:
650
+ return self.image_vit.patch_embedding.weight.device
651
+
652
+ def forward(
653
+ self,
654
+ images: torch.Tensor,
655
+ pooled_patches_idx: torch.Tensor,
656
+ image_masks: torch.Tensor = None,
657
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
658
+
659
+ # image_features: (batch_size, num_crops(=num_image), num_patch, n x image_emb_dim)
660
+ batch_size, num_image = images.shape[:2]
661
+ images = images.to(device=self.device, dtype=self.dtype)
662
+ image_features = self.encode_image(images)
663
+
664
+ # optional padding embeddings
665
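+ # pad_embed[0] is added where image_masks == 0 (fully padded patches), pad_embed[1] where 0 < image_masks < 1 (partially padded).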
+ if self.pad_embed is not None and image_masks is not None:
666
+ image_masks = image_masks.to(device=self.device)
667
+ all_pad = (image_masks == 0).to(image_features.dtype)
668
+ partial = torch.logical_and(image_masks < 1, ~ (image_masks == 0)).to(image_features.dtype)
669
+ image_features = image_features + self.pad_embed[0][None,None,None,:] * all_pad[...,None] \
670
+ + self.pad_embed[1][None,None,None,:] * partial[...,None]
671
+
672
+ image_features = self.image_feature_dropout(image_features)
673
+ dim = image_features.shape[-1]
674
+
675
+ valid = pooled_patches_idx >= 0
676
+ valid_token = torch.any(valid, -1)
677
+
678
+ # Use `pooled_patches_idx` to arrange the features for image pooling
679
+ batch_idx = torch.arange(pooled_patches_idx.shape[0], dtype=torch.long, device=pooled_patches_idx.device)
680
+ batch_idx = torch.tile(batch_idx.view(batch_size, 1, 1), [1, pooled_patches_idx.shape[1], pooled_patches_idx.shape[2]])
681
+
682
+ # Now [batch, num_high_res_features, pool_dim, dim]
683
+ to_pool = image_features.reshape(batch_size, -1, dim)[batch_idx, torch.clip(pooled_patches_idx, 0)]
684
+ to_pool = to_pool * valid.to(self.dtype)[:, :, :, None]
685
+ to_pool = to_pool.reshape([-1, pooled_patches_idx.shape[-1], dim])
686
+
687
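+ # Attention-pool each group of patches, using the mean of the group as the query.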
+ query = to_pool.mean(-2, keepdim=True)
688
+ pooled_features = self.image_pooling_2d(query, to_pool)
689
+ pooled_features = pooled_features.reshape([batch_size, -1, pooled_features.shape[-1]])
690
+
691
+ # MLP layer to map the feature.
692
+ pooled_features = self.image_projector(pooled_features)
693
+ return pooled_features.view(-1, pooled_features.shape[-1])[valid_token.flatten()]
694
+
695
+
696
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
697
+ def rotate_half(x):
698
+ """Rotates half the hidden dims of the input."""
699
+ x1 = x[..., : x.shape[-1] // 2]
700
+ x2 = x[..., x.shape[-1] // 2 :]
701
+ return torch.cat((-x2, x1), dim=-1)
702
+
703
+
704
+ # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
705
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
706
+ """Applies Rotary Position Embedding to the query and key tensors.
707
+
708
+ Args:
709
+ q (`torch.Tensor`): The query tensor.
710
+ k (`torch.Tensor`): The key tensor.
711
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
712
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
713
+ position_ids (`torch.Tensor`, *optional*):
714
+ Deprecated and unused.
715
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
716
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
717
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
718
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
719
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
720
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
721
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
722
+ Returns:
723
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
724
+ """
725
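+ # With the default unsqueeze_dim=1, cos/sin of shape (batch, seq, head_dim) broadcast over the head
+ # dimension of q and k shaped (batch, heads, seq, head_dim); the outputs keep the input shapes.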
+ cos = cos.unsqueeze(unsqueeze_dim)
726
+ sin = sin.unsqueeze(unsqueeze_dim)
727
+ q_embed = (q * cos) + (rotate_half(q) * sin)
728
+ k_embed = (k * cos) + (rotate_half(k) * sin)
729
+ return q_embed, k_embed
730
+
731
+
732
+ # Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding
733
+ class MolmoActRotaryEmbedding(nn.Module):
734
+
735
+ def __init__(self, config: MolmoActLlmConfig, device: Union[str, torch.device] = None):
736
+ super().__init__()
737
+ # BC: "rope_type" was originally "type"
738
+ if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
739
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
740
+ else:
741
+ self.rope_type = "default"
742
+ self.max_seq_len_cached = config.max_position_embeddings
743
+ self.original_max_seq_len = config.max_position_embeddings
744
+
745
+ self.config = config
746
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
747
+
748
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
749
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
750
+ self.original_inv_freq = self.inv_freq
751
+
752
+ @torch.no_grad()
753
+ @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
754
+ def forward(self, x, position_ids: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
755
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
756
+ position_ids_expanded = position_ids[:, None, :].float()
757
+
758
+ device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
759
+ with torch.autocast(device_type=device_type, enabled=False): # Force float32
760
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
761
+ emb = torch.cat((freqs, freqs), dim=-1)
762
+ cos = emb.cos() * self.attention_scaling
763
+ sin = emb.sin() * self.attention_scaling
764
+
765
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
766
+
767
+
768
+ @use_kernel_forward_from_hub("RMSNorm")
769
+ class MolmoActRMSNorm(nn.Module):
770
+
771
+ def __init__(
772
+ self,
773
+ size: int,
774
+ eps: float = 1e-6,
775
+ device: Union[str, torch.device] = None,
776
+ ):
777
+ super().__init__()
778
+ self.weight = nn.Parameter(torch.ones(size, device=device))
779
+ self.eps = eps
780
+
781
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
782
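+ # Compute the RMS statistic in float32 (autocast disabled) for numerical stability, then cast back to the input dtype.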
+ with torch.autocast(enabled=False, device_type=x.device.type):
783
+ og_dtype = x.dtype
784
+ x = x.to(torch.float32)
785
+ variance = x.pow(2).mean(-1, keepdim=True)
786
+ x = x * torch.rsqrt(variance + self.eps)
787
+ x = x.to(og_dtype)
788
+
789
+ return self.weight * x
790
+
791
+ def extra_repr(self):
792
+ return f"{tuple(self.weight.shape)}, eps={self.eps}"
793
+
794
+
795
+ # Copied from transformers.models.llama.modeling_llama.repeat_kv
796
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
797
+ """
798
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
799
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
800
+ """
801
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
802
+ if n_rep == 1:
803
+ return hidden_states
804
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
805
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
806
+
807
+
808
+ def eager_attention_forward(
809
+ module: nn.Module,
810
+ query: torch.Tensor,
811
+ key: torch.Tensor,
812
+ value: torch.Tensor,
813
+ attention_mask: Optional[torch.Tensor],
814
+ scaling: float,
815
+ dropout: float = 0.0,
816
+ **kwargs,
817
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
818
+ key_states = repeat_kv(key, module.num_key_value_groups)
819
+ value_states = repeat_kv(value, module.num_key_value_groups)
820
+
821
+ attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
822
+ if attention_mask is not None:
823
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
824
+ attn_weights = attn_weights + causal_mask
825
+
826
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
827
+ attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
828
+ attn_output = torch.matmul(attn_weights, value_states)
829
+ attn_output = attn_output.transpose(1, 2).contiguous()
830
+
831
+ return attn_output, attn_weights
832
+
833
+
834
+ class MolmoActAttention(nn.Module):
835
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
836
+
837
+ # copied from transformers.models.llama.modeling_llama.LlamaAttention.__init__ with Llama->MolmoAct
838
+ def __init__(self, config: MolmoActLlmConfig, layer_idx: Optional[int] = None) -> None:
839
+ super().__init__()
840
+ self.config = config
841
+ self.layer_idx = layer_idx
842
+ if layer_idx is None:
843
+ logger.warning_once(
844
+ f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
845
+ "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
846
+ "when creating this class."
847
+ )
848
+
849
+ self.num_heads = config.num_attention_heads
850
+ self.num_key_value_heads = config.num_key_value_heads
851
+ self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
852
+ self.head_dim = config.head_dim
853
+ self.scaling = self.head_dim**-0.5
854
+ self.is_causal = True
855
+
856
+ if (config.head_dim * config.num_attention_heads) != config.hidden_size:
857
+ raise ValueError(
858
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {config.hidden_size}"
859
+ f" and `num_attention_heads`: {config.num_attention_heads})."
860
+ )
861
+
862
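+ # Fused QKV projection: a single linear layer produces query, key and value; fused_dims records the split sizes used in forward.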
+ self.fused_dims = (
863
+ config.hidden_size,
864
+ config.head_dim * config.num_key_value_heads,
865
+ config.head_dim * config.num_key_value_heads,
866
+ )
867
+ self.att_proj = nn.Linear(
868
+ config.hidden_size,
869
+ sum(self.fused_dims),
870
+ bias=config.qkv_bias,
871
+ )
872
+
873
+ # Layer norms.
874
+ self.k_norm: Optional[MolmoActRMSNorm] = None
875
+ self.q_norm: Optional[MolmoActRMSNorm] = None
876
+ self.qk_norm_type: Optional[str] = None
877
+ if config.use_qk_norm:
878
+ k_norm_size = (
879
+ config.head_dim
880
+ if config.qk_norm_type == "qwen3" else
881
+ config.num_key_value_heads * config.head_dim
882
+ )
883
+ self.k_norm = MolmoActRMSNorm(k_norm_size, eps=config.layer_norm_eps)
884
+ q_norm_size = (
885
+ config.head_dim
886
+ if config.qk_norm_type == "qwen3" else
887
+ config.num_attention_heads * config.head_dim
888
+ )
889
+ self.q_norm = MolmoActRMSNorm(q_norm_size, eps=config.layer_norm_eps)
890
+ self.qk_norm_type = config.qk_norm_type
891
+
892
+ self.attention_dropout = config.attention_dropout
893
+
894
+ self.attn_out = nn.Linear(
895
+ config.hidden_size,
896
+ config.hidden_size,
897
+ bias=False,
898
+ )
899
+
900
+ def forward(
901
+ self,
902
+ hidden_states: torch.Tensor,
903
+ position_embeddings: Tuple[torch.Tensor, torch.Tensor],
904
+ attention_mask: Optional[torch.Tensor],
905
+ past_key_value: Optional[Cache] = None,
906
+ cache_position: Optional[torch.LongTensor] = None,
907
+ **kwargs: Unpack[FlashAttentionKwargs],
908
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
909
+ input_shape = hidden_states.shape[:-1]
910
+ hidden_shape = (*input_shape, -1, self.head_dim)
911
+
912
+ qkv = self.att_proj(hidden_states)
913
+ query_states, key_states, value_states = qkv.split(self.fused_dims, dim=-1)
914
+ value_states = value_states.view(hidden_shape)
915
+
916
+ # Optionally apply layer norm to keys and queries.
917
+ if self.q_norm is not None and self.k_norm is not None and self.qk_norm_type != "qwen3":
918
+ query_states = self.q_norm(query_states)
919
+ key_states = self.k_norm(key_states)
920
+
921
+ query_states = query_states.view(hidden_shape)
922
+ key_states = key_states.view(hidden_shape)
923
+ if self.q_norm is not None and self.k_norm is not None and self.qk_norm_type == "qwen3":
924
+ query_states = self.q_norm(query_states)
925
+ key_states = self.k_norm(key_states)
926
+ query_states = query_states.transpose(1, 2)
927
+ key_states = key_states.transpose(1, 2)
928
+ value_states = value_states.transpose(1, 2)
929
+
930
+ cos, sin = position_embeddings
931
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
932
+
933
+ if past_key_value is not None:
934
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
935
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
936
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
937
+
938
+ attention_interface: Callable = eager_attention_forward
939
+ if self.config._attn_implementation != "eager":
940
+ if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
941
+ logger.warning_once(
942
+ "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
943
+ 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
944
+ )
945
+ else:
946
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
947
+
948
+ attn_output, attn_weights = attention_interface(
949
+ self,
950
+ query_states,
951
+ key_states,
952
+ value_states,
953
+ attention_mask,
954
+ dropout=0.0 if not self.training else self.attention_dropout,
955
+ scaling=self.scaling,
956
+ **kwargs,
957
+ )
958
+
959
+ attn_output = attn_output.reshape(*input_shape, -1).contiguous()
960
+ attn_output = self.attn_out(attn_output)
961
+
962
+ return attn_output, attn_weights
963
+
964
+
965
+ class LanguageModelMLP(nn.Module):
966
+
967
+ def __init__(
968
+ self,
969
+ input_dim: int,
970
+ intermediate_size: int,
971
+ hidden_act: str,
972
+ device: Union[str, torch.device] = None,
973
+ ):
974
+ super().__init__()
975
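+ # ff_proj produces the value and gate halves in a single projection; forward applies act(gate) * value before ff_out.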
+ self.ff_proj = nn.Linear(input_dim, intermediate_size * 2, bias=False, device=device)
976
+ self.ff_out = nn.Linear(intermediate_size, input_dim, bias=False, device=device)
977
+ self.act = ACT2FN[hidden_act]
978
+
979
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
980
+ x = self.ff_proj(x)
981
+ x, gate = x.chunk(2, dim=-1)
982
+ x = self.act(gate) * x
983
+ x = self.ff_out(x)
984
+ return x
985
+
986
+
987
+ class MolmoActDecoderLayer(GradientCheckpointingLayer):
988
+
989
+ def __init__(
990
+ self,
991
+ config: MolmoActLlmConfig,
992
+ layer_idx: Optional[int] = None,
993
+ device: Union[str, torch.device] = None
994
+ ):
995
+ super().__init__()
996
+ self.config = config
997
+
998
+ self.self_attn = MolmoActAttention(config, layer_idx)
999
+ self.attn_norm = MolmoActRMSNorm(
1000
+ config.hidden_size, eps=config.layer_norm_eps, device=device)
1001
+ self.dropout = nn.Dropout(config.residual_dropout)
1002
+ self.mlp = LanguageModelMLP(
1003
+ config.hidden_size, config.intermediate_size, config.hidden_act, device=device)
1004
+ self.ff_norm = MolmoActRMSNorm(
1005
+ config.hidden_size, eps=config.layer_norm_eps, device=device)
1006
+
1007
+ def forward(
1008
+ self,
1009
+ hidden_states: torch.Tensor,
1010
+ attention_mask: Optional[torch.Tensor] = None,
1011
+ position_ids: Optional[torch.LongTensor] = None,
1012
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
1013
+ output_attentions: Optional[bool] = False,
1014
+ use_cache: Optional[bool] = False,
1015
+ cache_position: Optional[torch.LongTensor] = None,
1016
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
1017
+ **kwargs,
1018
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
1019
+ """
1020
+ Args:
1021
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
1022
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
1023
+ `(batch, sequence_length)` where padding elements are indicated by 0.
1024
+ output_attentions (`bool`, *optional*):
1025
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
1026
+ returned tensors for more detail.
1027
+ use_cache (`bool`, *optional*):
1028
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
1029
+ (see `past_key_values`).
1030
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
1031
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
1032
+ Indices depicting the position of the input sequence tokens in the sequence.
1033
+ position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
1034
+ Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
1035
+ with `head_dim` being the embedding dimension of each attention head.
1036
+ kwargs (`dict`, *optional*):
1037
+ Arbitrary kwargs to be ignored, used for FSDP and other methods that inject code
1038
+ into the model
1039
+ """
1040
+
1041
+ residual = hidden_states
1042
+ hidden_states = self.attn_norm(hidden_states)
1043
+
1044
+ # Self Attention
1045
+ hidden_states, self_attn_weights = self.self_attn(
1046
+ hidden_states=hidden_states,
1047
+ attention_mask=attention_mask,
1048
+ position_ids=position_ids,
1049
+ past_key_value=past_key_value,
1050
+ output_attentions=output_attentions,
1051
+ use_cache=use_cache,
1052
+ cache_position=cache_position,
1053
+ position_embeddings=position_embeddings,
1054
+ )
1055
+
1056
+ hidden_states = residual + self.dropout(hidden_states)
1057
+
1058
+ # Fully Connected
1059
+ residual = hidden_states
1060
+ hidden_states = self.ff_norm(hidden_states)
1061
+ hidden_states = self.mlp(hidden_states)
1062
+
1063
+ hidden_states = residual + self.dropout(hidden_states)
1064
+
1065
+ outputs = (hidden_states,)
1066
+
1067
+ if output_attentions:
1068
+ outputs += (self_attn_weights,)
1069
+
1070
+ return outputs
1071
+
1072
+
1073
+ class MolmoActPostNormDecoderLayer(MolmoActDecoderLayer):
1074
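+ # Identical to MolmoActDecoderLayer, except the norms are applied after attention / MLP (post-norm) instead of before.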
+ def forward(
1075
+ self,
1076
+ hidden_states: torch.Tensor,
1077
+ attention_mask: Optional[torch.Tensor] = None,
1078
+ position_ids: Optional[torch.LongTensor] = None,
1079
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
1080
+ output_attentions: Optional[bool] = False,
1081
+ use_cache: Optional[bool] = False,
1082
+ cache_position: Optional[torch.LongTensor] = None,
1083
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
1084
+ **kwargs,
1085
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
1086
+ """
1087
+ Args:
1088
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
1089
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
1090
+ `(batch, sequence_length)` where padding elements are indicated by 0.
1091
+ output_attentions (`bool`, *optional*):
1092
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
1093
+ returned tensors for more detail.
1094
+ use_cache (`bool`, *optional*):
1095
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
1096
+ (see `past_key_values`).
1097
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
1098
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
1099
+ Indices depicting the position of the input sequence tokens in the sequence.
1100
+ position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
1101
+ Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
1102
+ with `head_dim` being the embedding dimension of each attention head.
1103
+ kwargs (`dict`, *optional*):
1104
+ Arbitrary kwargs to be ignored, used for FSDP and other methods that inject code
1105
+ into the model
1106
+ """
1107
+
1108
+ residual = hidden_states
1109
+
1110
+ # Self Attention
1111
+ hidden_states, self_attn_weights = self.self_attn(
1112
+ hidden_states=hidden_states,
1113
+ attention_mask=attention_mask,
1114
+ position_ids=position_ids,
1115
+ past_key_value=past_key_value,
1116
+ output_attentions=output_attentions,
1117
+ use_cache=use_cache,
1118
+ cache_position=cache_position,
1119
+ position_embeddings=position_embeddings,
1120
+ )
1121
+ hidden_states = self.attn_norm(hidden_states)
1122
+
1123
+ hidden_states = residual + self.dropout(hidden_states)
1124
+
1125
+ # Fully Connected
1126
+ residual = hidden_states
1127
+ hidden_states = self.mlp(hidden_states)
1128
+ hidden_states = self.ff_norm(hidden_states)
1129
+
1130
+ hidden_states = residual + self.dropout(hidden_states)
1131
+
1132
+ outputs = (hidden_states,)
1133
+
1134
+ if output_attentions:
1135
+ outputs += (self_attn_weights,)
1136
+
1137
+ return outputs
1138
+
1139
+
1140
+ class MolmoActEmbedding(nn.Module):
1141
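+ # Embedding table for the base vocabulary plus a separately stored table for additional tokens; the two are concatenated at lookup time.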
+ def __init__(
1142
+ self,
1143
+ num_embeddings: int,
1144
+ num_new_embeddings: int,
1145
+ features: int,
1146
+ device: Union[str, torch.device] = None,
1147
+ ):
1148
+ super().__init__()
1149
+ self.embedding = nn.Parameter(
1150
+ torch.zeros(num_embeddings, features, device=device),
1151
+ )
1152
+ self.new_embedding = nn.Parameter(
1153
+ torch.zeros(num_new_embeddings, features, device=device),
1154
+ )
1155
+
1156
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
1157
+ return F.embedding(x, torch.cat([self.embedding, self.new_embedding], dim=0))
1158
+
1159
+
1160
+ MOLMO2_TEXT_ONLY_INPUTS_DOCSTRING = r"""
1161
+ Args:
1162
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1163
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
1164
+ it.
1165
+
1166
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1167
+ [`PreTrainedTokenizer.__call__`] for details.
1168
+
1169
+ [What are input IDs?](../glossary#input-ids)
1170
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
1171
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
1172
+
1173
+ - 1 for tokens that are **not masked**,
1174
+ - 0 for tokens that are **masked**.
1175
+
1176
+ [What are attention masks?](../glossary#attention-mask)
1177
+
1178
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1179
+ [`PreTrainedTokenizer.__call__`] for details.
1180
+
1181
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
1182
+ `past_key_values`).
1183
+
1184
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
1185
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
1186
+ information on the default strategy.
1187
+
1188
+ - 1 indicates the head is **not masked**,
1189
+ - 0 indicates the head is **masked**.
1190
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1191
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
1192
+ config.n_positions - 1]`.
1193
+
1194
+ [What are position IDs?](../glossary#position-ids)
1195
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
1196
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
1197
+ blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
1198
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
1199
+
1200
+ Two formats are allowed:
1201
+ - a [`~cache_utils.Cache`] instance, see our
1202
+ [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
1203
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
1204
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
1205
+ cache format.
1206
+
1207
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
1208
+ legacy cache format will be returned.
1209
+
1210
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
1211
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
1212
+ of shape `(batch_size, sequence_length)`.
1213
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
1214
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
1215
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
1216
+ model's internal embedding lookup matrix.
1217
+ use_cache (`bool`, *optional*):
1218
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
1219
+ `past_key_values`).
1220
+ output_attentions (`bool`, *optional*):
1221
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
1222
+ tensors for more detail.
1223
+ output_hidden_states (`bool`, *optional*):
1224
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
1225
+ more detail.
1226
+ return_dict (`bool`, *optional*):
1227
+ Whether or not to return a [`CausalLMOutputWithPast`] instead of a plain tuple.
1228
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
1229
+ Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
1230
+ this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
1231
+ the complete sequence length.
1232
+ """
1233
+
1234
+
1235
+ @add_start_docstrings(
1236
+ "The bare MolmoAct text-only model outputting raw hidden-states without any specific head on top.",
1237
+ MOLMO_START_DOCSTRING,
1238
+ )
1239
+ class MolmoActLlm(MolmoActPreTrainedModel):
1240
+ def __init__(self, config: MolmoActLlmConfig):
1241
+ super().__init__(config)
1242
+ self.config = config
1243
+ if config.additional_vocab_size is not None:
1244
+ self.wte = MolmoActEmbedding(
1245
+ config.vocab_size,
1246
+ config.additional_vocab_size,
1247
+ config.hidden_size,
1248
+ )
1249
+ else:
1250
+ self.wte = nn.Embedding(config.vocab_size, config.hidden_size)
1251
+ self.emb_drop = nn.Dropout(config.embedding_dropout)
1252
+ decoder_layer = MolmoActPostNormDecoderLayer if config.norm_after else MolmoActDecoderLayer
1253
+ self.blocks = nn.ModuleList(
1254
+ [decoder_layer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
1255
+ )
1256
+ self.ln_f = MolmoActRMSNorm(config.hidden_size, eps=config.layer_norm_eps)
1257
+ self.rotary_emb = MolmoActRotaryEmbedding(config)
1258
+ self.gradient_checkpointing = False
1259
+
1260
+ # Initialize weights and apply final processing
1261
+ self.post_init()
1262
+
1263
+ def get_input_embeddings(self) -> torch.nn.Module:
1264
+ return self.wte
1265
+
1266
+ def set_input_embeddings(self, value: torch.nn.Module) -> None:
1267
+ self.wte = value
1268
+
1269
+ @can_return_tuple
1270
+ def forward(
1271
+ self,
1272
+ input_ids: Optional[torch.LongTensor] = None,
1273
+ attention_mask: Optional[torch.Tensor] = None,
1274
+ position_ids: Optional[torch.LongTensor] = None,
1275
+ past_key_values: Optional[Cache] = None,
1276
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1277
+ use_cache: Optional[bool] = None,
1278
+ output_attentions: Optional[bool] = None,
1279
+ output_hidden_states: Optional[bool] = None,
1280
+ cache_position: Optional[torch.LongTensor] = None,
1281
+ **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
1282
+ ) -> BaseModelOutputWithPast:
1283
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1284
+ output_hidden_states = (
1285
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1286
+ )
1287
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
1288
+
1289
+ if (input_ids is None) ^ (inputs_embeds is not None):
1290
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
1291
+
1292
+ if self.gradient_checkpointing and self.training and use_cache:
1293
+ logger.warning_once(
1294
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
1295
+ )
1296
+ use_cache = False
1297
+
1298
+ # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
1299
+ if not isinstance(past_key_values, (type(None), Cache)):
1300
+ raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
1301
+
1302
+ if inputs_embeds is None:
1303
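+ # Map -1 ids (used for masked/padding positions) to 0 so the embedding lookup stays in range.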
+ input_ids = input_ids * (input_ids != -1).to(input_ids.dtype)
1304
+ inputs_embeds = self.wte(input_ids)
1305
+
1306
+ if use_cache and past_key_values is None:
1307
+ past_key_values = DynamicCache()
1308
+
1309
+ if cache_position is None:
1310
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
1311
+ cache_position = torch.arange(
1312
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
1313
+ )
1314
+
1315
+ if position_ids is None:
1316
+ position_ids = cache_position.unsqueeze(0)
1317
+
1318
+ causal_mask = self._update_causal_mask(
1319
+ attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
1320
+ )
1321
+
1322
+ hidden_states = inputs_embeds
1323
+
1324
+ # create position embeddings to be shared across the decoder layers
1325
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
1326
+
1327
+ # decoder layers
1328
+ all_hidden_states = () if output_hidden_states else None
1329
+ all_self_attns = () if output_attentions else None
1330
+
1331
+ for decoder_block in self.blocks[: self.config.num_hidden_layers]:
1332
+ if output_hidden_states:
1333
+ all_hidden_states += (hidden_states,)
1334
+
1335
+ layer_outputs = decoder_block(
1336
+ hidden_states,
1337
+ attention_mask=causal_mask,
1338
+ position_ids=position_ids,
1339
+ past_key_value=past_key_values,
1340
+ output_attentions=output_attentions,
1341
+ use_cache=use_cache,
1342
+ cache_position=cache_position,
1343
+ position_embeddings=position_embeddings,
1344
+ **flash_attn_kwargs,
1345
+ )
1346
+
1347
+ hidden_states = layer_outputs[0]
1348
+
1349
+ if output_attentions:
1350
+ all_self_attns += (layer_outputs[1],)
1351
+
1352
+ hidden_states = self.ln_f(hidden_states)
1353
+
1354
+ # add hidden states from the last decoder layer
1355
+ if output_hidden_states:
1356
+ all_hidden_states += (hidden_states,)
1357
+
1358
+ return BaseModelOutputWithPast(
1359
+ last_hidden_state=hidden_states,
1360
+ past_key_values=past_key_values if use_cache else None,
1361
+ hidden_states=all_hidden_states,
1362
+ attentions=all_self_attns,
1363
+ )
1364
+
1365
+ def _update_causal_mask(
1366
+ self,
1367
+ attention_mask: Union[torch.Tensor, "BlockMask"],
1368
+ input_tensor: torch.Tensor,
1369
+ cache_position: torch.Tensor,
1370
+ past_key_values: Cache,
1371
+ output_attentions: bool = False,
1372
+ ):
1373
+ if self.config._attn_implementation == "flash_attention_2":
1374
+ if attention_mask is not None and (attention_mask == 0.0).any():
1375
+ return attention_mask
1376
+ return None
1377
+ if self.config._attn_implementation == "flex_attention":
1378
+ if isinstance(attention_mask, torch.Tensor):
1379
+ attention_mask = make_flex_block_causal_mask(attention_mask)
1380
+ return attention_mask
1381
+
1382
+ # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
1383
+ # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
1384
+ # to infer the attention mask.
1385
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
1386
+ using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False
1387
+
1388
+ # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
1389
+ if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
1390
+ if AttentionMaskConverter._ignore_causal_mask_sdpa(
1391
+ attention_mask,
1392
+ inputs_embeds=input_tensor,
1393
+ past_key_values_length=past_seen_tokens,
1394
+ is_training=self.training,
1395
+ ):
1396
+ return None
1397
+
1398
+ dtype = input_tensor.dtype
1399
+ sequence_length = input_tensor.shape[1]
1400
+ if using_compilable_cache:
1401
+ target_length = past_key_values.get_max_cache_shape()
1402
+ else:
1403
+ target_length = (
1404
+ attention_mask.shape[-1]
1405
+ if isinstance(attention_mask, torch.Tensor)
1406
+ else past_seen_tokens + sequence_length + 1
1407
+ )
1408
+
1409
+ # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
1410
+ causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
1411
+ attention_mask,
1412
+ sequence_length=sequence_length,
1413
+ target_length=target_length,
1414
+ dtype=dtype,
1415
+ cache_position=cache_position,
1416
+ batch_size=input_tensor.shape[0],
1417
+ )
1418
+
1419
+ if (
1420
+ self.config._attn_implementation == "sdpa"
1421
+ and attention_mask is not None
1422
+ and attention_mask.device.type in ["cuda", "xpu", "npu"]
1423
+ and not output_attentions
1424
+ ):
1425
+ # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
1426
+ # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
1427
+ # Details: https://github.com/pytorch/pytorch/issues/110213
1428
+ min_dtype = torch.finfo(dtype).min
1429
+ causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
1430
+
1431
+ return causal_mask
1432
+
1433
+ @staticmethod
1434
+ def _prepare_4d_causal_attention_mask_with_cache_position(
1435
+ attention_mask: torch.Tensor,
1436
+ sequence_length: int,
1437
+ target_length: int,
1438
+ dtype: torch.dtype,
1439
+ cache_position: torch.Tensor,
1440
+ batch_size: int,
1441
+ **kwargs,
1442
+ ):
1443
+ """
1444
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
1445
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
1446
+
1447
+ Args:
1448
+ attention_mask (`torch.Tensor`):
1449
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
1450
+ `(batch_size, 1, query_length, key_value_length)`.
1451
+ sequence_length (`int`):
1452
+ The sequence length being processed.
1453
+ target_length (`int`):
1454
+ The target length: when generating with static cache, the mask should be as long as the static cache,
1455
+ to account for the 0 padding, the part of the cache that is not filled yet.
1456
+ dtype (`torch.dtype`):
1457
+ The dtype to use for the 4D attention mask.
1458
+ cache_position (`torch.Tensor`):
1459
+ Indices depicting the position of the input sequence tokens in the sequence.
1460
+ batch_size (`torch.Tensor`):
1461
+ Batch size.
1462
+ """
1463
+ if attention_mask is not None and attention_mask.dim() == 4:
1464
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
1465
+ causal_mask = attention_mask
1466
+ else:
1467
+ min_dtype = torch.finfo(dtype).min
1468
+ causal_mask = torch.full(
1469
+ (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
1470
+ )
1471
+ if sequence_length != 1:
1472
+ causal_mask = torch.triu(causal_mask, diagonal=1)
1473
+ causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
1474
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
1475
+ if attention_mask is not None:
1476
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
1477
+ mask_length = attention_mask.shape[-1]
1478
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
1479
+ causal_mask.device
1480
+ )
1481
+ padding_mask = padding_mask == 0
1482
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
1483
+ padding_mask, min_dtype
1484
+ )
1485
+
1486
+ return causal_mask
1487
+
1488
+
1489
+ @add_start_docstrings(
1490
+ "The MolmoAct text-only model which consists of a language model + lm head.",
1491
+ MOLMO_START_DOCSTRING,
1492
+ )
1493
+ class MolmoActForCausalLM(MolmoActPreTrainedModel, GenerationMixin):
1494
+ _tied_weights_keys = [] # Weights are not tied
1495
+ _tp_plan = {"lm_head": "colwise_rep"}
1496
+ _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
1497
+ base_model_prefix = "model"
1498
+
1499
+ def __init__(self, config: MolmoActLlmConfig):
1500
+ super().__init__(config)
1501
+ self.model = MolmoActLlm(config)
1502
+ self.vocab_size = config.vocab_size
1503
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1504
+
1505
+ # Initialize weights and apply final processing
1506
+ self.post_init()
1507
+
1508
+ def get_input_embeddings(self) -> torch.nn.Module:
1509
+ return self.model.wte
1510
+
1511
+ def set_input_embeddings(self, value: torch.nn.Module) -> None:
1512
+ self.model.wte = value
1513
+
1514
+ def get_output_embeddings(self):
1515
+ return self.lm_head
1516
+
1517
+ def set_output_embeddings(self, value: torch.nn.Module) -> None:
1518
+ self.lm_head = value
1519
+
1520
+ def set_decoder(self, decoder: torch.nn.Module) -> None:
1521
+ self.model = decoder
1522
+
1523
+ def get_decoder(self) -> torch.nn.Module:
1524
+ return self.model
1525
+
1526
+ @can_return_tuple
1527
+ @add_start_docstrings_to_model_forward(MOLMO2_TEXT_ONLY_INPUTS_DOCSTRING)
1528
+ def forward(
1529
+ self,
1530
+ input_ids: Optional[torch.LongTensor] = None,
1531
+ attention_mask: Optional[torch.Tensor] = None,
1532
+ position_ids: Optional[torch.LongTensor] = None,
1533
+ past_key_values: Optional[Cache] = None,
1534
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1535
+ labels: Optional[torch.LongTensor] = None,
1536
+ use_cache: Optional[bool] = None,
1537
+ output_attentions: Optional[bool] = None,
1538
+ output_hidden_states: Optional[bool] = None,
1539
+ cache_position: Optional[torch.LongTensor] = None,
1540
+ logits_to_keep: Union[int, torch.Tensor] = 0,
1541
+ **kwargs,
1542
+ ) -> CausalLMOutputWithPast:
1543
+ r"""
1544
+ ```python
1545
+ >>> from transformers import AutoTokenizer, MolmoActForCausalLM
1546
+
1547
+ >>> model = MolmoActForCausalLM.from_pretrained("...")
1548
+ >>> tokenizer = AutoTokenizer.from_pretrained("...")
1549
+
1550
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
1551
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
1552
+
1553
+ >>> # Generate
1554
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1555
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1556
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
1557
+ ```"""
1558
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1559
+ output_hidden_states = (
1560
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1561
+ )
1562
+
1563
+ # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
1564
+ outputs: BaseModelOutputWithPast = self.model(
1565
+ input_ids=input_ids,
1566
+ attention_mask=attention_mask,
1567
+ position_ids=position_ids,
1568
+ past_key_values=past_key_values,
1569
+ inputs_embeds=inputs_embeds,
1570
+ use_cache=use_cache,
1571
+ output_attentions=output_attentions,
1572
+ output_hidden_states=output_hidden_states,
1573
+ cache_position=cache_position,
1574
+ **kwargs,
1575
+ )
1576
+
1577
+ hidden_states = outputs.last_hidden_state
1578
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
1579
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
1580
+ logits = self.lm_head(hidden_states[:, slice_indices, :])
1581
+
1582
+ loss = None
1583
+ if labels is not None:
1584
+ loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
1585
+
1586
+ return CausalLMOutputWithPast(
1587
+ loss=loss,
1588
+ logits=logits,
1589
+ past_key_values=outputs.past_key_values,
1590
+ hidden_states=outputs.hidden_states,
1591
+ attentions=outputs.attentions,
1592
+ )
1593
+
1594
+
1595
+ MOLMO2_INPUTS_DOCSTRING = r"""
1596
+ Args:
1597
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1598
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
1599
+ it.
1600
+
1601
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1602
+ [`PreTrainedTokenizer.__call__`] for details.
1603
+
1604
+ [What are input IDs?](../glossary#input-ids)
1605
+ images (`torch.FloatTensor` of shape `(batch_size, n_crops, 27*27, 3*14*14)`, *optional*):
1606
+ The input crops, with pixel values between 0 and 1 and normalized with SigLIP2 mean/std
1607
+
1608
+ Each crop contains 27x27 patches with 14*14*3 pixel values
1609
+ image_masks (`torch.FloatTensor` of shape `(batch_size, n_crops, n_patches, n_features)`, *optional*):
1610
+ Image masks showing what portion of each patch is padding
1611
+ pooled_patches_idx (`torch.LongTensor` of shape `(batch_size, n_image_tokens, n_pooled_patches)`):
1612
+ For each image patch token in `input_ids`, the indices of the patches in `images`
1613
+ to pool for that token; an index of -1
1614
+ means the patch is ignored.
1615
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
1616
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
1617
+
1618
+ - 1 for tokens that are **not masked**,
1619
+ - 0 for tokens that are **masked**.
1620
+
1621
+ [What are attention masks?](../glossary#attention-mask)
1622
+
1623
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1624
+ [`PreTrainedTokenizer.__call__`] for details.
1625
+
1626
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
1627
+ `past_key_values`).
1628
+
1629
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
1630
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
1631
+ information on the default strategy.
1632
+
1633
+ - 1 indicates the head is **not masked**,
1634
+ - 0 indicates the head is **masked**.
1635
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1636
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
1637
+ config.n_positions - 1]`.
1638
+
1639
+ [What are position IDs?](../glossary#position-ids)
1640
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
1641
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
1642
+ blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
1643
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
1644
+
1645
+ Two formats are allowed:
1646
+ - a [`~cache_utils.Cache`] instance, see our
1647
+ [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
1648
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
1649
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
1650
+ cache format.
1651
+
1652
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
1653
+ legacy cache format will be returned.
1654
+
1655
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
1656
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
1657
+ of shape `(batch_size, sequence_length)`.
1658
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
1659
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
1660
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
1661
+ model's internal embedding lookup matrix.
1662
+ use_cache (`bool`, *optional*):
1663
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
1664
+ `past_key_values`).
1665
+ output_attentions (`bool`, *optional*):
1666
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
1667
+ tensors for more detail.
1668
+ output_hidden_states (`bool`, *optional*):
1669
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
1670
+ more detail.
1671
+ return_dict (`bool`, *optional*):
1672
+ Whether or not to return a [`MolmoActCausalLMOutputWithPast`] instead of a plain tuple.
1673
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
1674
+ Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
1675
+ this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
1676
+ the complete sequence length.
1677
+ """
1678
+
1679
+
1680
+ @add_start_docstrings(
1681
+ "The bare MolmoAct model outputting raw hidden-states without any specific head on top.",
1682
+ MOLMO_START_DOCSTRING,
1683
+ )
1684
+ class MolmoActModel(MolmoActPreTrainedModel):
1685
+ _checkpoint_conversion_mapping = {}
1686
+
1687
+ def __init__(self, config: MolmoActConfig):
1688
+ super().__init__(config)
1689
+ self.transformer: MolmoActLlm = MolmoActLlm(config.llm_config)
1690
+ self.vision_backbone: Optional[MolmoActVisionBackbone] = None
1691
+ if config.vit_config is not None and config.adapter_config is not None:
1692
+ self.vision_backbone = MolmoActVisionBackbone(config.vit_config, config.adapter_config)
1693
+
1694
+ # Initialize weights and apply final processing
1695
+ self.post_init()
1696
+
1697
+ def get_input_embeddings(self) -> torch.nn.Module:
1698
+ return self.transformer.wte
1699
+
1700
+ def set_input_embeddings(self, value: torch.nn.Module) -> None:
1701
+ self.transformer.wte = value
1702
+
1703
+ @property
1704
+ def device(self) -> torch.device:
1705
+ return self.transformer.ln_f.weight.device
1706
+
1707
+ def build_input_embeddings(
1708
+ self,
1709
+ input_ids: torch.LongTensor,
1710
+ images: Optional[torch.FloatTensor] = None, # image inputs
1711
+ image_masks: Optional[torch.Tensor] = None,
1712
+ pooled_patches_idx: Optional[torch.LongTensor] = None,
1713
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
1714
+
1715
+ # Get embeddings of input.
1716
+ # shape: (batch_size, seq_len, d_model)
1717
+ input_ids = input_ids * (input_ids != -1).to(input_ids.dtype)
1718
+ x = self.transformer.wte(input_ids)
1719
+
1720
+ image_features: Optional[torch.FloatTensor] = None
1721
+ if images is not None:
1722
+ image_features = self.vision_backbone(images, pooled_patches_idx)
1723
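+ # Add the pooled image features to the token embeddings at the positions marked with image_patch_id.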
+ is_image_patch = input_ids.view(-1) == self.config.image_patch_id
1724
+ assert is_image_patch.sum() == len(image_features)
1725
+ x.view(-1, x.shape[-1])[is_image_patch] += image_features
1726
+
1727
+ # shape: (batch_size, seq_len, d_model)
1728
+ x = self.transformer.emb_drop(x) # type: ignore
1729
+
1730
+ return x, image_features
1731
+
1732
+ @can_return_tuple
1733
+ def forward(
1734
+ self,
1735
+ input_ids: Optional[torch.LongTensor] = None,
1736
+ images: Optional[torch.FloatTensor] = None,
1737
+ image_masks: Optional[torch.Tensor] = None,
1738
+ pooled_patches_idx: Optional[torch.Tensor] = None,
1739
+ attention_mask: Optional[torch.Tensor] = None,
1740
+ position_ids: Optional[torch.Tensor] = None,
1741
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
1742
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1743
+ use_cache: Optional[bool] = None,
1744
+ output_attentions: Optional[bool] = None,
1745
+ output_hidden_states: Optional[bool] = None,
1746
+ cache_position: Optional[torch.LongTensor] = None,
1747
+ ) -> Union[Tuple, MolmoActModelOutputWithPast]:
1748
+
1749
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1750
+ output_hidden_states = (
1751
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1752
+ )
1753
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
1754
+
1755
+ if (input_ids is None) ^ (inputs_embeds is not None):
1756
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
1757
+
1758
+ if images is not None and inputs_embeds is not None:
1759
+ raise ValueError(
1760
+ "You cannot specify both images and inputs_embeds at the same time."
1761
+ )
1762
+
1763
+ if inputs_embeds is None:
1764
+ inputs_embeds, image_features = self.build_input_embeddings(
1765
+ input_ids, images, image_masks, pooled_patches_idx)
1766
+
1767
+ outputs = self.transformer(
1768
+ attention_mask=attention_mask,
1769
+ position_ids=position_ids,
1770
+ past_key_values=past_key_values,
1771
+ inputs_embeds=inputs_embeds,
1772
+ use_cache=use_cache,
1773
+ output_attentions=output_attentions,
1774
+ output_hidden_states=output_hidden_states,
1775
+ cache_position=cache_position,
1776
+ )
1777
+
1778
+ return MolmoActModelOutputWithPast(
1779
+ last_hidden_state=outputs.last_hidden_state,
1780
+ past_key_values=outputs.past_key_values,
1781
+ hidden_states=outputs.hidden_states,
1782
+ attentions=outputs.attentions,
1783
+ image_hidden_states=image_features if images is not None else None,
1784
+ )
1785
+
1786
+ @add_start_docstrings(
1787
+ "The MolmoAct model which consists of a vision backbone and a language model + lm head.",
1788
+ MOLMO_START_DOCSTRING,
1789
+ )
1790
+ class MolmoActForActionReasoning(MolmoActPreTrainedModel, GenerationMixin):
1791
+ _checkpoint_conversion_mapping = {}
1792
+ _tied_weights_keys = [] # Weights are not tied
1793
+ config_class = MolmoActConfig
1794
+
1795
+ def __init__(self, config: MolmoActConfig):
1796
+ super().__init__(config)
1797
+
1798
+ self.model = MolmoActModel(config)
1799
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1800
+ self.vocab_size = config.vocab_size
1801
+
1802
+ # Initialize weights and apply final processing
1803
+ self.post_init()
1804
+
1805
+ # --- Action parsing / de-tokenization setup ---
1806
+ # Stats dict expected under config.norm_stats (per-dataset key). If missing, default to empty.
1807
+ self.norm_stats = getattr(config, "norm_stats", None) or {}
1808
+ # Number of discretization bins used for action tokens, defaults to 256.
1809
+ self.n_action_bins = getattr(config, "n_action_bins", 256)
1810
+ # Precompute bin centers in [-1, 1] for inverse token to value mapping.
1811
+ self.bins = np.linspace(-1.0, 1.0, self.n_action_bins)
1812
+ self.bin_centers = (self.bins[:-1] + self.bins[1:]) / 2.0
1813
+ # Lazily constructed tokenizer for converting token strings to ids
1814
+ self._qwen_tokenizer = None
1815
+
1816
+ def get_input_embeddings(self) -> torch.nn.Module:
1817
+ return self.model.transformer.wte
1818
+
1819
+ def set_input_embeddings(self, value: torch.nn.Module) -> None:
1820
+ self.model.transformer.wte = value
1821
+
1822
+ def get_output_embeddings(self):
1823
+ return self.lm_head
1824
+
1825
+ def set_output_embeddings(self, value: torch.nn.Module) -> None:
1826
+ self.lm_head = value
1827
+
1828
+ # Make modules available through conditional class for BC
1829
+ @property
1830
+ def language_model(self) -> torch.nn.Module:
1831
+ return self.model.transformer
1832
+
1833
+ @property
1834
+ def vision_backbone(self) -> torch.nn.Module:
1835
+ return self.model.vision_backbone
1836
+
1837
+ @can_return_tuple
1838
+ @add_start_docstrings_to_model_forward(MOLMO2_INPUTS_DOCSTRING)
1839
+ def forward(
1840
+ self,
1841
+ input_ids: torch.LongTensor = None,
1842
+ images: Optional[torch.Tensor] = None,
1843
+ image_masks: Optional[torch.Tensor] = None,
1844
+ pooled_patches_idx: Optional[torch.Tensor] = None,
1845
+ attention_mask: Optional[torch.Tensor] = None,
1846
+ position_ids: Optional[torch.LongTensor] = None,
1847
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1848
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1849
+ labels: Optional[torch.LongTensor] = None,
1850
+ use_cache: Optional[bool] = None,
1851
+ output_attentions: Optional[bool] = None,
1852
+ output_hidden_states: Optional[bool] = None,
1853
+ cache_position: Optional[torch.LongTensor] = None,
1854
+ logits_to_keep: Union[int, torch.Tensor] = 0,
1855
+ **kwargs,
1856
+ ) -> Union[Tuple, MolmoActCausalLMOutputWithPast]:
1857
+ r"""
1858
+ ```python
1859
+ >>> from PIL import Image
1860
+ >>> import requests
1861
+ >>> from transformers import AutoProcessor, MolmoActForActionReasoning
1862
+
1863
+ >>> model = MolmoActForActionReasoning.from_pretrained("...")
1864
+ >>> processor = AutoProcessor.from_pretrained("...")
1865
+
1866
+ >>> prompt = "What's the content of the image?"
1867
+ >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
1868
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1869
+
1870
+ >>> inputs = processor(images=image, text=prompt, apply_chat_template=True, return_tensors="pt")
1871
+
1872
+ >>> # Generate
1873
+ >>> generated_ids = model.generate(**inputs, max_new_tokens=15)
1874
+ >>> generated_tokens = generated_ids[:, inputs['input_ids'].size(1):]
1875
+ >>> processor.batch_decode(generated_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1876
+ "The image features a busy city street with a stop sign prominently displayed"
1877
+ ```"""
1878
+ outputs = self.model(
1879
+ input_ids=input_ids,
1880
+ images=images,
1881
+ image_masks=image_masks,
1882
+ pooled_patches_idx=pooled_patches_idx,
1883
+ attention_mask=attention_mask,
1884
+ position_ids=position_ids,
1885
+ past_key_values=past_key_values,
1886
+ inputs_embeds=inputs_embeds,
1887
+ use_cache=use_cache,
1888
+ output_attentions=output_attentions,
1889
+ output_hidden_states=output_hidden_states,
1890
+ cache_position=cache_position,
1891
+ )
1892
+
1893
+ hidden_states = outputs.last_hidden_state
1894
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
1895
+ logits = self.lm_head(hidden_states[:, slice_indices, :])
1896
+
1897
+ loss = None
1898
+ if labels is not None:
1899
+ loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.vocab_size)
1900
+
1901
+ return MolmoActCausalLMOutputWithPast(
1902
+ loss=loss,
1903
+ logits=logits,
1904
+ past_key_values=outputs.past_key_values,
1905
+ hidden_states=outputs.hidden_states,
1906
+ attentions=outputs.attentions,
1907
+ image_hidden_states=outputs.image_hidden_states,
1908
+ )
1909
+
1910
+ # ===== Utilities for action parsing / un-normalization =====
1911
+ def _check_unnorm_key(self, unnorm_key: Optional[str]) -> str:
1912
+ """Validate and resolve which dataset key to use from self.norm_stats."""
1913
+ if not self.norm_stats:
1914
+ raise ValueError("No norm_stats found in config; cannot unnormalize actions.")
1915
+ if unnorm_key is None:
1916
+ if len(self.norm_stats) != 1:
1917
+ raise ValueError(
1918
+ f"Model has multiple dataset stats; please pass `unnorm_key` from {list(self.norm_stats.keys())}"
1919
+ )
1920
+ return next(iter(self.norm_stats.keys()))
1921
+ if unnorm_key not in self.norm_stats:
1922
+ raise ValueError(f"`unnorm_key`={unnorm_key!r} not in {list(self.norm_stats.keys())}")
1923
+ return unnorm_key
1924
+
1925
+ def get_action_dim(self, unnorm_key: Optional[str] = None) -> int:
1926
+ """Return action dimensionality from q01 stats length for the dataset key."""
1927
+ key = self._check_unnorm_key(unnorm_key)
1928
+ return len(self.norm_stats[key]["action"]["q01"])
1929
+
1930
+ def get_action_stats(self, unnorm_key: Optional[str] = None) -> Dict[str, Any]:
1931
+ """Return the full action stats dict for a given dataset key."""
1932
+ key = self._check_unnorm_key(unnorm_key)
1933
+ return self.norm_stats[key]["action"]
1934
+
1935
+ @torch.no_grad()
1936
+ def parse_action(self, text: str, unnorm_key: Optional[str] = None) -> list:
1937
+ """
1938
+ Parse generated text to extract 1×D action token lists, decode them to continuous values,
1939
+ and unnormalize using dataset-specific stats from `config.norm_stats`.
1940
+
1941
+ This follows the pipeline used in `experiments/robot/libero/main_libero_10_evaluation.py`:
1942
+ - Find bracketed token lists following the phrase "the action that the robot should take is" (case-insensitive),
1943
+ falling back to any bracketed list in the text.
1944
+ - Convert token strings → ids via Qwen2Tokenizer.
1945
+ - Map ids → discretized bin indices using: `discretized = vocab_size - token_id - 1` (clipped to bins)
1946
+ - Convert bins → normalized actions in [-1, 1] using precomputed `bin_centers`.
1947
+ - Unnormalize with q01/q99 and optional `mask` from norm_stats.
1948
+
1949
+ Returns:
1950
+ List[List[float]]: one unnormalized action vector of length D per extracted token list.
1951
+ """
1952
+ # Resolve action dimension and stats
1953
+ action_dim = self.get_action_dim(unnorm_key)
1954
+ stats = self.get_action_stats(unnorm_key)
1955
+ q01 = np.asarray(stats["q01"], dtype=np.float32)
1956
+ q99 = np.asarray(stats["q99"], dtype=np.float32)
1957
+ mask = np.asarray(stats.get("mask", np.ones_like(q01, dtype=bool)), dtype=bool)
1958
+
1959
+ # Lazily load the tokenizer (shared across calls)
1960
+ if self._qwen_tokenizer is None:
1961
+ self._qwen_tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen2-7B")
1962
+
1963
+ token_lists = extract_action_token_lists(text, only_len=action_dim)
1964
+ action_lists = []
1965
+
1966
+ # Decode every extracted token list (temporal aggregation, if any, should be done by the caller)
1967
+ for tokens in token_lists:
1968
+
1969
+ # Convert tokens → ids (replace None with vocab_size to avoid negatives)
1970
+ ids = self._qwen_tokenizer.convert_tokens_to_ids(tokens)
1971
+ ids = [self._qwen_tokenizer.vocab_size if i is None else int(i) for i in ids]
1972
+ ids = np.asarray(ids, dtype=np.int64)
1973
+
1974
+ # ids → discretized bin indices → normalized actions in [-1, 1]
1975
+ discretized = self._qwen_tokenizer.vocab_size - ids
1976
+ discretized = np.clip(discretized - 1, a_min=0, a_max=self.bin_centers.shape[0] - 1)
1977
+ normalized = self.bin_centers[discretized]
1978
+
1979
+ # Unnormalize using per-dimension statistics
1980
+ unnorm = 0.5 * (normalized + 1.0) * (q99 - q01) + q01
1981
+ actions = np.where(mask, unnorm, normalized)
1982
+
1983
+ action_lists.append([float(x) for x in actions])
1984
+
1985
+ # Return a Python list of float actions
1986
+ return action_lists
1987
+
1988
+ @torch.no_grad()
1989
+ def parse_trace(self, text: str) -> list:
1990
+ return extract_trace_lists(text, point_len=2, min_points=1)
1991
+
1992
+ @torch.no_grad()
1993
+ def parse_depth(self, text: str) -> list:
1994
+ return extract_depth_string(text, include_tags=True)
1995
+
1996
+
1997
+ def prepare_inputs_for_generation(
1998
+ self,
1999
+ input_ids: torch.LongTensor,
2000
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
2001
+ inputs_embeds: Optional[torch.FloatTensor] = None,
2002
+ images: Optional[torch.FloatTensor] = None,
2003
+ image_masks: Optional[torch.Tensor] = None,
2004
+ pooled_patches_idx: Optional[torch.Tensor] = None,
2005
+ attention_mask: Optional[torch.Tensor] = None,
2006
+ cache_position: Optional[torch.LongTensor] = None,
2007
+ logits_to_keep: Optional[Union[int, torch.Tensor]] = None,
2008
+ **kwargs,
2009
+ ):
2010
+
2011
+ model_inputs = super().prepare_inputs_for_generation(
2012
+ input_ids,
2013
+ past_key_values=past_key_values,
2014
+ inputs_embeds=inputs_embeds,
2015
+ attention_mask=attention_mask,
2016
+ cache_position=cache_position,
2017
+ logits_to_keep=logits_to_keep,
2018
+ **kwargs,
2019
+ )
2020
+
2021
+ if cache_position[0] == 0:
2022
+ model_inputs["images"] = images
2023
+ model_inputs["pooled_patches_idx"] = pooled_patches_idx
2024
+ model_inputs["image_masks"] = image_masks
2025
+
2026
+ return model_inputs
2027
+
2028
+ def _update_model_kwargs_for_generation(
2029
+ self,
2030
+ outputs: ModelOutput,
2031
+ model_kwargs: Dict[str, Any],
2032
+ is_encoder_decoder: bool = False,
2033
+ num_new_tokens: int = 1,
2034
+ ) -> Dict[str, Any]:
2035
+ if model_kwargs["use_cache"] and "images" in model_kwargs:
2036
+ # After the first step, no longer pass the images into forward since the image tokens
2037
+ # are already cached
2038
+ for k in ["images", "image_masks", "pooled_patches_idx"]:
2039
+ del model_kwargs[k]
2040
+ return super()._update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder, num_new_tokens)
2041
+
2042
+ @staticmethod
2043
+ def _prepare_4d_causal_attention_mask_with_cache_position(
2044
+ attention_mask: torch.Tensor,
2045
+ sequence_length: int,
2046
+ target_length: int,
2047
+ dtype: torch.dtype,
2048
+ cache_position: torch.Tensor,
2049
+ batch_size: int,
2050
+ **kwargs,
2051
+ ):
2052
+ """
2053
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
2054
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
2055
+
2056
+ Args:
2057
+ attention_mask (`torch.Tensor`):
2058
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
2059
+ `(batch_size, 1, query_length, key_value_length)`.
2060
+ sequence_length (`int`):
2061
+ The sequence length being processed.
2062
+ target_length (`int`):
2063
+ The target length: when generating with static cache, the mask should be as long as the static cache,
2064
+ to account for the 0 padding, the part of the cache that is not filled yet.
2065
+ dtype (`torch.dtype`):
2066
+ The dtype to use for the 4D attention mask.
2067
+ cache_position (`torch.Tensor`):
2068
+ Indices depicting the position of the input sequence tokens in the sequence.
2069
+ batch_size (`torch.Tensor`):
2070
+ Batch size.
2071
+ """
2072
+ if attention_mask is not None and attention_mask.dim() == 4:
2073
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
2074
+ causal_mask = attention_mask
2075
+ else:
2076
+ min_dtype = torch.finfo(dtype).min
2077
+ causal_mask = torch.full(
2078
+ (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
2079
+ )
2080
+ if sequence_length != 1:
2081
+ causal_mask = torch.triu(causal_mask, diagonal=1)
2082
+ causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
2083
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
2084
+ if attention_mask is not None:
2085
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
2086
+ mask_length = attention_mask.shape[-1]
2087
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
2088
+ causal_mask.device
2089
+ )
2090
+ padding_mask = padding_mask == 0
2091
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
2092
+ padding_mask, min_dtype
2093
+ )
2094
+
2095
+ return causal_mask
2096
+
2097
+
2098
+ # Always register for multi-modal features
2099
+ AutoModelForImageTextToText.register(MolmoActConfig, MolmoActForActionReasoning)
2100
+ AutoModelForCausalLM.register(MolmoActLlmConfig, MolmoActForCausalLM)
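As a quick illustration of the action de-tokenization that `parse_action` performs above, the following self-contained sketch reproduces the same arithmetic with hypothetical token ids, an assumed Qwen2-style vocabulary size, and made-up q01/q99 statistics; the real values come from the checkpoint's tokenizer and `config.norm_stats`.

```python
import numpy as np

# Hypothetical inputs for illustration only (not taken from a real checkpoint).
vocab_size = 151936                     # assumed Qwen2-style vocab size
n_action_bins = 256
token_ids = np.array([151800, 151750, 151700, 151820, 151790, 151810, 151680])

# Bin centers in [-1, 1], exactly as precomputed in __init__ above.
bins = np.linspace(-1.0, 1.0, n_action_bins)
bin_centers = (bins[:-1] + bins[1:]) / 2.0

# token ids -> discretized bin indices -> normalized actions in [-1, 1]
discretized = vocab_size - token_ids
discretized = np.clip(discretized - 1, a_min=0, a_max=bin_centers.shape[0] - 1)
normalized = bin_centers[discretized]

# Unnormalize with per-dimension q01/q99 statistics (assumed values here).
q01 = np.full(7, -0.5, dtype=np.float32)
q99 = np.full(7, 0.5, dtype=np.float32)
action = 0.5 * (normalized + 1.0) * (q99 - q01) + q01
print(action.tolist())
```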
preprocessor_config.json ADDED
@@ -0,0 +1,27 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoImageProcessor": "image_processing_molmoact.MolmoActImageProcessor",
4
+ "AutoProcessor": "processing_molmoact.MolmoActProcessor"
5
+ },
6
+ "base_image_input_size": [
7
+ 336,
8
+ 336
9
+ ],
10
+ "crop_mode": "overlap-and-resize-c2",
11
+ "do_convert_rgb": true,
12
+ "do_pad": true,
13
+ "image_patch_size": 14,
14
+ "image_pooling_h": 2,
15
+ "image_pooling_w": 2,
16
+ "image_processor_type": "MolmoActImageProcessor",
17
+ "max_crops": 8,
18
+ "max_multi_image_crops": 8,
19
+ "normalize_mode": "openai",
20
+ "overlap_margins": [
21
+ 4,
22
+ 4
23
+ ],
24
+ "pad_value": 0.0,
25
+ "processor_class": "MolmoActProcessor",
26
+ "resize_mode": "default"
27
+ }
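A rough sanity check of how these numbers fit together, under the assumption that each 336×336 crop is split into 14×14 patches that are then pooled 2×2 before reaching the language model; the exact token count per crop also depends on `crop_mode`, the overlap margins, and the `<im_start>`/`<im_col>`/`<im_end>` markers inserted by the processor below:

```python
base_image_input_size = 336
image_patch_size = 14
image_pooling_h = image_pooling_w = 2

patches_per_side = base_image_input_size // image_patch_size   # 24
pooled_per_side = patches_per_side // image_pooling_h          # 12
print(patches_per_side, pooled_per_side, pooled_per_side * pooled_per_side)
# 24 12 144 -> roughly 144 pooled image tokens per 336x336 crop
```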
processing_molmoact.py ADDED
@@ -0,0 +1,465 @@
1
+ """
2
+ Processor class for MolmoAct.
3
+ """
4
+ from typing import List, Optional, Union, Dict, Tuple
5
+
6
+ import PIL
7
+ from PIL import ImageFile, ImageOps
8
+
9
+ try:
10
+ from typing import Unpack
11
+ except ImportError:
12
+ from typing_extensions import Unpack
13
+
14
+ import numpy as np
15
+ import torch
16
+
17
+ from transformers.image_utils import ImageInput
18
+ from transformers.processing_utils import (
19
+ ProcessingKwargs,
20
+ ProcessorMixin,
21
+ )
22
+ from transformers.feature_extraction_utils import BatchFeature
23
+ from transformers.tokenization_utils_base import TextInput, PreTokenizedInput
24
+ from transformers.utils import logging
25
+
26
+ from transformers import AutoTokenizer
27
+ from .image_processing_molmoact import MolmoActImagesKwargs, MolmoActImageProcessor
28
+
29
+
30
+ logger = logging.get_logger(__name__)
31
+
32
+
33
+ # Special tokens, these should be present in any tokenizer we use since the preprocessor uses them
34
+ IMAGE_PATCH_TOKEN = f"<im_patch>" # Where to insert high-res tokens
35
+ IMAGE_LOW_RES_TOKEN = f"<im_low>" # Where to insert low-res tokens
36
+ IM_START_TOKEN = f"<im_start>"
37
+ IM_END_TOKEN = f"<im_end>"
38
+ IM_COL_TOKEN = f"<im_col>"
39
+ IMAGE_PROMPT = "<|image|>"
40
+
41
+ EXTRA_TOKENS = (IM_START_TOKEN, IM_END_TOKEN, IMAGE_PATCH_TOKEN,
42
+ IM_COL_TOKEN, IMAGE_PROMPT, IMAGE_LOW_RES_TOKEN)
43
+
44
+
45
+ DEMO_STYLES = [
46
+ "point_count",
47
+ "pointing",
48
+ "cosyn_point",
49
+ "user_qa",
50
+ "long_caption",
51
+ "short_caption",
52
+ "video_long_caption",
53
+ "video_short_caption",
54
+ "correction_qa",
55
+ "demo",
56
+ "android_control",
57
+ ]
58
+
59
+
60
+ def setup_pil():
61
+ PIL.Image.MAX_IMAGE_PIXELS = None
62
+ ImageFile.LOAD_TRUNCATED_IMAGES = True
63
+
64
+
65
+ def get_special_token_ids(tokenizer: AutoTokenizer) -> Dict[str, int]:
66
+ ids = tokenizer.encode("".join(EXTRA_TOKENS), add_special_tokens=False)
67
+ assert len(ids) == len(EXTRA_TOKENS)
68
+ return {k: i for k, i in zip(EXTRA_TOKENS, ids)}
69
+
70
+
71
+ def load_image(image: Union[PIL.Image.Image, np.ndarray]) -> np.ndarray:
72
+ """Load image"""
73
+ setup_pil()
74
+ if isinstance(image, PIL.Image.Image):
75
+ image = image.convert("RGB")
76
+ image = ImageOps.exif_transpose(image)
77
+ return np.array(image)
78
+ elif isinstance(image, np.ndarray):
79
+ assert len(image.shape) == 3, "Image should have 3 dimensions"
80
+ assert image.shape[2] == 3, "Image should have 3 channels"
81
+ assert image.dtype == np.uint8, "Image should have uint8 type"
82
+ return image
83
+ else:
84
+ raise ValueError("Image should be PIL.Image or np.ndarray")
85
+
86
+
87
+ class MolmoActProcessorKwargs(ProcessingKwargs, total=False):
88
+ """MolmoAct processor kwargs"""
89
+ images_kwargs: MolmoActImagesKwargs
90
+ _defaults = {
91
+ "text_kwargs": {
92
+ "padding": False,
93
+ },
94
+ }
95
+
96
+
97
+ class MolmoActProcessor(ProcessorMixin):
98
+ attributes = ["image_processor", "tokenizer"]
99
+ optional_attributes = [
100
+ "chat_template",
101
+ "prompt_templates",
102
+ "message_format",
103
+ "system_prompt",
104
+ "style",
105
+ "always_start_with_space",
106
+ "default_inference_len",
107
+ "use_col_tokens",
108
+ "image_padding_mask",
109
+ ]
110
+ image_processor_class = "AutoImageProcessor"
111
+ tokenizer_class = "AutoTokenizer"
112
+
113
+ def __init__(
114
+ self,
115
+ image_processor: MolmoActImageProcessor = None,
116
+ tokenizer: AutoTokenizer = None,
117
+ chat_template: Optional[str] = None,
118
+ prompt_templates: Optional[str] = "uber_model",
119
+ message_format: Optional[str] = "role",
120
+ system_prompt: Optional[str] = "demo_or_style",
121
+ style: Optional[str] = "demo",
122
+ always_start_with_space: Optional[bool] = False,
123
+ default_inference_len: Optional[int] = 65,
124
+ use_col_tokens: Optional[bool] = True,
125
+ image_padding_mask: bool = False,
126
+ **kwargs
127
+ ) -> None:
128
+ if tokenizer.padding_side != "left":
129
+ logger.warning(f"Tokenizer {tokenizer.name_or_path} is not left-padded, padding side will be set to left")
130
+ tokenizer.padding_side = "left" # type: ignore
131
+ super().__init__(
132
+ image_processor,
133
+ tokenizer,
134
+ chat_template=chat_template,
135
+ prompt_templates=prompt_templates,
136
+ message_format=message_format,
137
+ system_prompt=system_prompt,
138
+ style=style,
139
+ always_start_with_space=always_start_with_space,
140
+ default_inference_len=default_inference_len,
141
+ use_col_tokens=use_col_tokens,
142
+ image_padding_mask=image_padding_mask,
143
+ )
144
+ self._special_tokens = None
145
+
146
+ @property
147
+ def special_token_ids(self):
148
+ if self._special_tokens is None:
149
+ self._special_tokens = get_special_token_ids(self.tokenizer)
150
+ return self._special_tokens
151
+
152
+ def get_user_prompt(self, text: TextInput) -> str:
153
+ """Get user prompt"""
154
+ if self.prompt_templates == "none":
155
+ return ""
156
+ elif self.prompt_templates == "uber_model":
157
+ return text
158
+ else:
159
+ raise NotImplementedError(self.prompt_templates)
160
+
161
+ def get_prefix(self) -> str:
162
+ """Get prefix"""
163
+ if self.system_prompt == "style_and_length": # captioner
164
+ assert self.style in ["long_caption"]
165
+ style = self.style
166
+ n = None if self.default_inference_len is None else str(self.default_inference_len)
167
+ if n is not None and len(n) > 0: # allow empty string to signal unconditioned
168
+ prefix = style + " " + n + ":"
169
+ else:
170
+ prefix = style + " :"
171
+ elif self.system_prompt == "demo_or_style": # demo model
172
+ if self.style in DEMO_STYLES:
173
+ prefix = ""
174
+ else:
175
+ prefix = self.style + ":"
176
+ else:
177
+ raise NotImplementedError(self.system_prompt)
178
+ return prefix
179
+
180
+ def format_prompt(self, prompt: str) -> str:
181
+ """Format prompt"""
182
+ if self.message_format == "none":
183
+ pass
184
+ elif self.message_format == "role":
185
+ prompt = "User: " + prompt + " Assistant:"
186
+ else:
187
+ raise NotImplementedError(self.message_format)
188
+
189
+ if self.always_start_with_space:
190
+ prompt = " " + prompt
191
+
192
+ return prompt
193
+
194
+ def get_prompt(self, text: TextInput) -> str:
195
+ prompt = self.get_user_prompt(text)
196
+ if self.system_prompt and self.system_prompt != "none":
197
+ prefix = self.get_prefix()
198
+ if len(prefix) > 0 and len(prompt) > 0:
199
+ prompt = prefix + " " + prompt
200
+ elif len(prefix) > 0:
201
+ prompt = prefix
202
+ prompt = self.format_prompt(prompt)
203
+ return prompt
204
+
205
+ def get_image_tokens(self, image_grid: np.ndarray):
206
+ joint = []
207
+ for h, w in image_grid:
208
+ per_row = np.full(w, IMAGE_PATCH_TOKEN)
209
+ if self.use_col_tokens:
210
+ per_row = np.concatenate([per_row, [IM_COL_TOKEN]], 0)
211
+ extra_tokens = np.tile(per_row, [h])
212
+ joint += [
213
+ [IM_START_TOKEN],
214
+ extra_tokens,
215
+ [IM_END_TOKEN],
216
+ ]
217
+ return np.concatenate(joint)
218
+
219
+ def insert_bos_numpy(
220
+ self,
221
+ input_ids: np.ndarray,
222
+ attention_mask: np.ndarray,
223
+ bos_token_id: int,
224
+ pad_token_id: int,
225
+ ):
226
+ """
227
+ Args:
228
+ input_ids: [B, S] array with left padding
229
+ attention_mask: [B, S] array (0 for pad, 1 for valid)
230
+ bos_token_id: int
231
+ pad_token_id: int
232
+ Returns:
233
+ input_ids_out: [B, S] or [B, S+1] array with bos inserted if needed
234
+ attention_mask_out: same shape as input_ids_out
235
+ """
236
+
237
+ need_to_expand = len(input_ids.shape) == 1
238
+ if need_to_expand:
239
+ input_ids = input_ids[None, :]
240
+ attention_mask = attention_mask[None, :]
241
+
242
+ B, S = input_ids.shape
243
+
244
+ # Handle zero-length sequence
245
+ if S == 0:
246
+ new_input_ids = np.full((B, 1), bos_token_id, dtype=input_ids.dtype)
247
+ new_attention_mask = np.ones((B, 1), dtype=attention_mask.dtype)
248
+ if need_to_expand:
249
+ new_input_ids = new_input_ids[0]
250
+ new_attention_mask = new_attention_mask[0]
251
+ return new_input_ids, new_attention_mask
252
+
253
+ first_valid_index = (attention_mask == 1).argmax(axis=-1) # [B]
254
+ bos_already_present = np.all(input_ids[np.arange(B), first_valid_index] == bos_token_id)
255
+
256
+ if bos_already_present:
257
+ if need_to_expand:
258
+ input_ids = input_ids[0]
259
+ attention_mask = attention_mask[0]
260
+ return input_ids, attention_mask
261
+ else:
262
+ new_input_ids = np.full((B, S+1), pad_token_id, dtype=input_ids.dtype)
263
+ new_attention_mask = np.zeros((B, S+1), dtype=attention_mask.dtype)
264
+
265
+ src_idx = np.tile(np.arange(S), (B, 1)) # [B, S]
266
+ valid_mask = src_idx >= first_valid_index[:, None] # [B, S]
267
+ tgt_idx = src_idx + 1 # shift right
268
+ batch_idx = np.tile(np.arange(B)[:, None], (1, S)) # [B, S]
269
+
270
+ # flatten valid_positions
271
+ flat_vals = input_ids[valid_mask]
272
+ flat_batch = batch_idx[valid_mask]
273
+ flat_tgt = tgt_idx[valid_mask]
274
+
275
+ new_input_ids[flat_batch, flat_tgt] = flat_vals
276
+ new_attention_mask[flat_batch, flat_tgt] = 1
277
+
278
+ insert_pos = first_valid_index
279
+ new_input_ids[np.arange(B), insert_pos] = bos_token_id
280
+ new_attention_mask[np.arange(B), insert_pos] = 1
281
+
282
+ if need_to_expand:
283
+ new_input_ids = new_input_ids[0]
284
+ new_attention_mask = new_attention_mask[0]
285
+
286
+ return new_input_ids, new_attention_mask
287
+
288
+ def insert_bos_torch(
289
+ self,
290
+ input_ids: torch.Tensor,
291
+ attention_mask: torch.Tensor,
292
+ bos_token_id: int,
293
+ pad_token_id: int,
294
+ ):
295
+ """
296
+ Args:
297
+ input_ids: [B, S] tensor with left padding
298
+ attention_mask: [B, S] tensor (0 for pad, 1 for valid)
299
+ bos_token_id: int
300
+ pad_token_id: int
301
+ Returns:
302
+ input_ids_out: [B, S] or [B, S+1] tensor with bos inserted if needed
303
+ attention_mask_out: same shape as input_ids_out
304
+ """
305
+
306
+ B, S = input_ids.shape
307
+ device = input_ids.device
308
+
309
+ # Handle zero-length sequence
310
+ if S == 0:
311
+ new_input_ids = torch.full((B, 1), bos_token_id, dtype=input_ids.dtype, device=device)
312
+ new_attention_mask = torch.ones((B, 1), dtype=attention_mask.dtype, device=device)
313
+ return new_input_ids, new_attention_mask
314
+
315
+ first_valid_index = (attention_mask == 1).long().argmax(dim=-1) # [B]
316
+ bos_already_present = (input_ids[torch.arange(B), first_valid_index] == bos_token_id).all()
317
+
318
+ if bos_already_present:
319
+ return input_ids, attention_mask
320
+ else:
321
+ new_input_ids = torch.full((B, S+1), pad_token_id, dtype=input_ids.dtype, device=device)
322
+ new_attention_mask = torch.zeros((B, S+1), dtype=attention_mask.dtype, device=device)
323
+
324
+ src_idx = torch.arange(S, device=device).expand(B, S) # [B, S]
325
+ valid_mask = src_idx >= first_valid_index.unsqueeze(1) # [B, S]
326
+ tgt_idx = src_idx + 1 # shift right
327
+ batch_idx = torch.arange(B, device=device).unsqueeze(1).expand_as(src_idx)
328
+
329
+ flat_vals = input_ids[valid_mask]
330
+ flat_batch = batch_idx[valid_mask]
331
+ flat_tgt = tgt_idx[valid_mask]
332
+
333
+ new_input_ids[flat_batch, flat_tgt] = flat_vals
334
+ new_attention_mask[flat_batch, flat_tgt] = 1
335
+
336
+ insert_pos = first_valid_index
337
+ batch_indices = torch.arange(B, device=device)
338
+ new_input_ids[batch_indices, insert_pos] = bos_token_id
339
+ new_attention_mask[batch_indices, insert_pos] = 1
340
+
341
+ return new_input_ids, new_attention_mask
342
+
343
+ def __call__(
344
+ self,
345
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
346
+ images: Union[ImageInput, List[ImageInput]] = None,
347
+ apply_chat_template: bool = False,
348
+ **kwargs: Unpack[MolmoActProcessorKwargs],
349
+ ) -> BatchFeature:
350
+ if images is None and text is None:
351
+ raise ValueError("You have to specify at least one of `images` or `text`.")
352
+
353
+ output_kwargs = self._merge_kwargs(
354
+ MolmoActProcessorKwargs,
355
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
356
+ **kwargs,
357
+ )
358
+
359
+ if isinstance(text, (list, tuple)) and isinstance(images, (list, tuple)):
360
+ if len(text) != len(images):
361
+ raise ValueError("You have to provide the same number of text and images")
362
+ if len(text) > 1 and not output_kwargs["text_kwargs"].get("padding", False):
363
+ raise ValueError("You have to specify padding when you have multiple text inputs")
364
+
365
+ if isinstance(text, str):
366
+ text = [text]
367
+ elif not isinstance(text, list) or not isinstance(text[0], str):
368
+ raise ValueError("Invalid input text. Please provide a string, or a list of strings")
369
+
370
+ if images is not None:
371
+ image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
372
+ else:
373
+ image_inputs = {}
374
+
375
+ if apply_chat_template:
376
+ text = [self.get_prompt(t) for t in text]
377
+
378
+ prompt_strings = text
379
+ if image_inputs.get("images", None) is not None:
380
+
381
+ prompt_strings = []
382
+ for idx, image_grids in enumerate(image_inputs.pop("image_grids")):
383
+ if isinstance(image_grids, torch.Tensor):
384
+ image_grids = image_grids.cpu().numpy()
385
+ if isinstance(images, (list, tuple)) and isinstance(images[idx], (list, tuple)):
386
+ image_grids = image_grids[~np.all(image_grids == -1, axis=-1)]
387
+ offset = 2 if len(images[idx]) < len(image_grids) else 1 # whether to use both low and high res images
388
+ all_image_strings = []
389
+ for i in range(0, len(image_grids), offset):
390
+ image_grids_i = image_grids[i:i+offset]
391
+ image_tokens = self.get_image_tokens(image_grids_i)
392
+ img_ix = i // offset
393
+ all_image_strings.append(f"Image {img_ix + 1}" + "".join(image_tokens))
394
+ image_string = "".join(all_image_strings)
395
+ prompt_strings.append(image_string + text[idx])
396
+ else:
397
+ image_grids = image_grids[~np.all(image_grids == -1, axis=-1)]
398
+ assert len(image_grids) in [1, 2], "Only one or two crops are supported for single image inputs"
399
+ image_tokens = self.get_image_tokens(image_grids)
400
+ image_string = "".join(image_tokens)
401
+ prompt_strings.append(image_string + text[idx])
402
+
403
+ text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
404
+
405
+ input_ids = text_inputs["input_ids"]
406
+ attention_mask = text_inputs["attention_mask"]
407
+
408
+ is_list = isinstance(input_ids, (list, tuple))
409
+ if is_list:
410
+ input_ids = np.array(input_ids)
411
+ attention_mask = np.array(attention_mask)
412
+
413
+ use_numpy = isinstance(attention_mask, np.ndarray)
414
+
415
+ if use_numpy and np.issubdtype(input_ids.dtype, np.floating):
416
+ input_ids = input_ids.astype(np.int64)
417
+ attention_mask = attention_mask.astype(np.int64)
418
+ elif not use_numpy and torch.is_floating_point(input_ids):
419
+ input_ids = input_ids.to(torch.int64)
420
+ attention_mask = attention_mask.to(torch.int64)
421
+
422
+ bos = self.tokenizer.bos_token_id or self.tokenizer.eos_token_id
423
+ if use_numpy:
424
+ input_ids, attention_mask = self.insert_bos_numpy(
425
+ input_ids, attention_mask, bos, self.tokenizer.pad_token_id
426
+ )
427
+ else:
428
+ input_ids, attention_mask = self.insert_bos_torch(
429
+ input_ids, attention_mask, bos, self.tokenizer.pad_token_id
430
+ )
431
+ if is_list:
432
+ input_ids = input_ids.tolist() # type: ignore
433
+ attention_mask = attention_mask.tolist() # type: ignore
434
+ text_inputs["input_ids"] = input_ids
435
+ text_inputs["attention_mask"] = attention_mask
436
+
437
+ if kwargs.get("device", None) is not None:
438
+ text_inputs = text_inputs.to(device=kwargs.get("device"), non_blocking=True)
439
+ # there is no bos token in Qwen tokenizer
440
+ return BatchFeature(
441
+ data={**text_inputs, **image_inputs}, tensor_type=output_kwargs["common_kwargs"]["return_tensors"]
442
+ )
443
+
444
+ def batch_decode(self, *args, **kwargs):
445
+ """
446
+ This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
447
+ refer to the docstring of this method for more information.
448
+ """
449
+ return self.tokenizer.batch_decode(*args, **kwargs)
450
+
451
+ def decode(self, *args, **kwargs):
452
+ """
453
+ This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
454
+ the docstring of this method for more information.
455
+ """
456
+ return self.tokenizer.decode(*args, **kwargs)
457
+
458
+ @property
459
+ def model_input_names(self):
460
+ tokenizer_input_names = self.tokenizer.model_input_names
461
+ image_processor_input_names = self.image_processor.model_input_names
462
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
463
+
464
+
465
+ MolmoActProcessor.register_for_auto_class()
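To make the prompt templating above concrete, here is a minimal sketch of the string that `get_prompt` produces with the defaults shipped in `processor_config.json` (style="demo", system_prompt="demo_or_style", message_format="role"). Since "demo" is in DEMO_STYLES the prefix is empty, so only the role wrapper is applied; this simplification ignores the other prompt-template branches.

```python
def sketch_get_prompt(user_text: str) -> str:
    # style "demo" is in DEMO_STYLES -> empty prefix under "demo_or_style"
    prefix = ""
    prompt = f"{prefix} {user_text}" if prefix else user_text
    # message_format "role" wraps the prompt in User/Assistant turns
    return "User: " + prompt + " Assistant:"

print(sketch_get_prompt("What's the content of the image?"))
# User: What's the content of the image? Assistant:
```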
processor_config.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "always_start_with_space": false,
3
+ "auto_map": {
4
+ "AutoProcessor": "processing_molmoact.MolmoActProcessor"
5
+ },
6
+ "default_inference_len": 65,
7
+ "image_padding_mask": true,
8
+ "message_format": "role",
9
+ "processor_class": "MolmoActProcessor",
10
+ "prompt_templates": "uber_model",
11
+ "style": "demo",
12
+ "system_prompt": "demo_or_style",
13
+ "use_col_tokens": true
14
+ }
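These fields are forwarded as keyword arguments to `MolmoActProcessor.__init__` when the processor is loaded through the auto classes; a minimal sketch, assuming a hypothetical repo id and that the checkpoint ships the custom code above:

```python
from transformers import AutoProcessor

# "allenai/MolmoAct-7B" is a placeholder repo id for illustration.
processor = AutoProcessor.from_pretrained("allenai/MolmoAct-7B", trust_remote_code=True)

# The JSON fields above become attributes on the processor instance.
print(processor.message_format)         # "role"
print(processor.default_inference_len)  # 65
```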
special_tokens_map.json ADDED
@@ -0,0 +1,3266 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "|<EXTRA_TOKENS_0>|",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "|<EXTRA_TOKENS_1>|",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ {
18
+ "content": "|<EXTRA_TOKENS_2>|",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ {
25
+ "content": "|<EXTRA_TOKENS_3>|",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ },
31
+ {
32
+ "content": "|<EXTRA_TOKENS_4>|",
33
+ "lstrip": false,
34
+ "normalized": false,
35
+ "rstrip": false,
36
+ "single_word": false
37
+ },
38
+ {
39
+ "content": "|<EXTRA_TOKENS_5>|",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false
44
+ },
45
+ {
46
+ "content": "|<EXTRA_TOKENS_6>|",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false
51
+ },
52
+ {
53
+ "content": "|<EXTRA_TOKENS_7>|",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false
58
+ },
59
+ {
60
+ "content": "|<EXTRA_TOKENS_8>|",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false
65
+ },
66
+ {
67
+ "content": "|<EXTRA_TOKENS_9>|",
68
+ "lstrip": false,
69
+ "normalized": false,
70
+ "rstrip": false,
71
+ "single_word": false
72
+ },
73
+ {
74
+ "content": "|<EXTRA_TOKENS_10>|",
75
+ "lstrip": false,
76
+ "normalized": false,
77
+ "rstrip": false,
78
+ "single_word": false
79
+ },
80
+ {
81
+ "content": "|<EXTRA_TOKENS_11>|",
82
+ "lstrip": false,
83
+ "normalized": false,
84
+ "rstrip": false,
85
+ "single_word": false
86
+ },
87
+ {
88
+ "content": "|<EXTRA_TOKENS_12>|",
89
+ "lstrip": false,
90
+ "normalized": false,
91
+ "rstrip": false,
92
+ "single_word": false
93
+ },
94
+ {
95
+ "content": "|<EXTRA_TOKENS_13>|",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": false,
99
+ "single_word": false
100
+ },
101
+ {
102
+ "content": "|<EXTRA_TOKENS_14>|",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false
107
+ },
108
+ {
109
+ "content": "|<EXTRA_TOKENS_15>|",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false
114
+ },
115
+ {
116
+ "content": "|<EXTRA_TOKENS_16>|",
117
+ "lstrip": false,
118
+ "normalized": false,
119
+ "rstrip": false,
120
+ "single_word": false
121
+ },
122
+ {
123
+ "content": "|<EXTRA_TOKENS_17>|",
124
+ "lstrip": false,
125
+ "normalized": false,
126
+ "rstrip": false,
127
+ "single_word": false
128
+ },
129
+ {
130
+ "content": "|<EXTRA_TOKENS_18>|",
131
+ "lstrip": false,
132
+ "normalized": false,
133
+ "rstrip": false,
134
+ "single_word": false
135
+ },
136
+ {
137
+ "content": "|<EXTRA_TOKENS_19>|",
138
+ "lstrip": false,
139
+ "normalized": false,
140
+ "rstrip": false,
141
+ "single_word": false
142
+ },
143
+ {
144
+ "content": "|<EXTRA_TOKENS_20>|",
145
+ "lstrip": false,
146
+ "normalized": false,
147
+ "rstrip": false,
148
+ "single_word": false
149
+ },
150
+ {
151
+ "content": "|<EXTRA_TOKENS_21>|",
152
+ "lstrip": false,
153
+ "normalized": false,
154
+ "rstrip": false,
155
+ "single_word": false
156
+ },
157
+ {
158
+ "content": "|<EXTRA_TOKENS_22>|",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false
163
+ },
164
+ {
165
+ "content": "|<EXTRA_TOKENS_23>|",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false
170
+ },
171
+ {
172
+ "content": "|<EXTRA_TOKENS_24>|",
173
+ "lstrip": false,
174
+ "normalized": false,
175
+ "rstrip": false,
176
+ "single_word": false
177
+ },
178
+ {
179
+ "content": "|<EXTRA_TOKENS_25>|",
180
+ "lstrip": false,
181
+ "normalized": false,
182
+ "rstrip": false,
183
+ "single_word": false
184
+ },
185
+ {
186
+ "content": "|<EXTRA_TOKENS_26>|",
187
+ "lstrip": false,
188
+ "normalized": false,
189
+ "rstrip": false,
190
+ "single_word": false
191
+ },
192
+ {
193
+ "content": "|<EXTRA_TOKENS_27>|",
194
+ "lstrip": false,
195
+ "normalized": false,
196
+ "rstrip": false,
197
+ "single_word": false
198
+ },
199
+ {
200
+ "content": "|<EXTRA_TOKENS_28>|",
201
+ "lstrip": false,
202
+ "normalized": false,
203
+ "rstrip": false,
204
+ "single_word": false
205
+ },
206
+ {
207
+ "content": "|<EXTRA_TOKENS_29>|",
208
+ "lstrip": false,
209
+ "normalized": false,
210
+ "rstrip": false,
211
+ "single_word": false
212
+ },
213
+ {
214
+ "content": "|<EXTRA_TOKENS_30>|",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false
219
+ },
220
+ {
221
+ "content": "|<EXTRA_TOKENS_31>|",
222
+ "lstrip": false,
223
+ "normalized": false,
224
+ "rstrip": false,
225
+ "single_word": false
226
+ },
227
+ {
228
+ "content": "|<EXTRA_TOKENS_32>|",
229
+ "lstrip": false,
230
+ "normalized": false,
231
+ "rstrip": false,
232
+ "single_word": false
233
+ },
234
+ {
235
+ "content": "|<EXTRA_TOKENS_33>|",
236
+ "lstrip": false,
237
+ "normalized": false,
238
+ "rstrip": false,
239
+ "single_word": false
240
+ },
241
+ {
242
+ "content": "|<EXTRA_TOKENS_34>|",
243
+ "lstrip": false,
244
+ "normalized": false,
245
+ "rstrip": false,
246
+ "single_word": false
247
+ },
248
+ {
249
+ "content": "|<EXTRA_TOKENS_35>|",
250
+ "lstrip": false,
251
+ "normalized": false,
252
+ "rstrip": false,
253
+ "single_word": false
254
+ },
255
+ {
256
+ "content": "|<EXTRA_TOKENS_36>|",
257
+ "lstrip": false,
258
+ "normalized": false,
259
+ "rstrip": false,
260
+ "single_word": false
261
+ },
262
+ {
263
+ "content": "|<EXTRA_TOKENS_37>|",
264
+ "lstrip": false,
265
+ "normalized": false,
266
+ "rstrip": false,
267
+ "single_word": false
268
+ },
269
+ {
270
+ "content": "|<EXTRA_TOKENS_38>|",
271
+ "lstrip": false,
272
+ "normalized": false,
273
+ "rstrip": false,
274
+ "single_word": false
275
+ },
276
+ {
277
+ "content": "|<EXTRA_TOKENS_39>|",
278
+ "lstrip": false,
279
+ "normalized": false,
280
+ "rstrip": false,
281
+ "single_word": false
282
+ },
283
+ {
284
+ "content": "|<EXTRA_TOKENS_40>|",
285
+ "lstrip": false,
286
+ "normalized": false,
287
+ "rstrip": false,
288
+ "single_word": false
289
+ },
290
+ {
291
+ "content": "|<EXTRA_TOKENS_41>|",
292
+ "lstrip": false,
293
+ "normalized": false,
294
+ "rstrip": false,
295
+ "single_word": false
296
+ },
297
+ {
298
+ "content": "|<EXTRA_TOKENS_42>|",
299
+ "lstrip": false,
300
+ "normalized": false,
301
+ "rstrip": false,
302
+ "single_word": false
303
+ },
304
+ {
305
+ "content": "|<EXTRA_TOKENS_43>|",
306
+ "lstrip": false,
307
+ "normalized": false,
308
+ "rstrip": false,
309
+ "single_word": false
310
+ },
311
+ {
312
+ "content": "|<EXTRA_TOKENS_44>|",
313
+ "lstrip": false,
314
+ "normalized": false,
315
+ "rstrip": false,
316
+ "single_word": false
317
+ },
318
+ {
319
+ "content": "|<EXTRA_TOKENS_45>|",
320
+ "lstrip": false,
321
+ "normalized": false,
322
+ "rstrip": false,
323
+ "single_word": false
324
+ },
325
+ {
326
+ "content": "|<EXTRA_TOKENS_46>|",
327
+ "lstrip": false,
328
+ "normalized": false,
329
+ "rstrip": false,
330
+ "single_word": false
331
+ },
332
+ {
333
+ "content": "|<EXTRA_TOKENS_47>|",
334
+ "lstrip": false,
335
+ "normalized": false,
336
+ "rstrip": false,
337
+ "single_word": false
338
+ },
339
+ {
340
+ "content": "|<EXTRA_TOKENS_48>|",
341
+ "lstrip": false,
342
+ "normalized": false,
343
+ "rstrip": false,
344
+ "single_word": false
345
+ },
346
+ {
347
+ "content": "|<EXTRA_TOKENS_49>|",
348
+ "lstrip": false,
349
+ "normalized": false,
350
+ "rstrip": false,
351
+ "single_word": false
352
+ },
353
+ {
354
+ "content": "|<EXTRA_TOKENS_50>|",
355
+ "lstrip": false,
356
+ "normalized": false,
357
+ "rstrip": false,
358
+ "single_word": false
359
+ },
360
+ {
361
+ "content": "|<EXTRA_TOKENS_51>|",
362
+ "lstrip": false,
363
+ "normalized": false,
364
+ "rstrip": false,
365
+ "single_word": false
366
+ },
367
+ {
368
+ "content": "|<EXTRA_TOKENS_52>|",
369
+ "lstrip": false,
370
+ "normalized": false,
371
+ "rstrip": false,
372
+ "single_word": false
373
+ },
374
+ {
375
+ "content": "|<EXTRA_TOKENS_53>|",
376
+ "lstrip": false,
377
+ "normalized": false,
378
+ "rstrip": false,
379
+ "single_word": false
380
+ },
381
+ {
382
+ "content": "|<EXTRA_TOKENS_54>|",
383
+ "lstrip": false,
384
+ "normalized": false,
385
+ "rstrip": false,
386
+ "single_word": false
387
+ },
388
+ {
389
+ "content": "|<EXTRA_TOKENS_55>|",
390
+ "lstrip": false,
391
+ "normalized": false,
392
+ "rstrip": false,
393
+ "single_word": false
394
+ },
395
+ {
396
+ "content": "|<EXTRA_TOKENS_56>|",
397
+ "lstrip": false,
398
+ "normalized": false,
399
+ "rstrip": false,
400
+ "single_word": false
401
+ },
402
+ {
403
+ "content": "|<EXTRA_TOKENS_57>|",
404
+ "lstrip": false,
405
+ "normalized": false,
406
+ "rstrip": false,
407
+ "single_word": false
408
+ },
409
+ {
410
+ "content": "|<EXTRA_TOKENS_58>|",
411
+ "lstrip": false,
412
+ "normalized": false,
413
+ "rstrip": false,
414
+ "single_word": false
415
+ },
416
+ {
417
+ "content": "|<EXTRA_TOKENS_59>|",
418
+ "lstrip": false,
419
+ "normalized": false,
420
+ "rstrip": false,
421
+ "single_word": false
422
+ },
423
+ {
424
+ "content": "|<EXTRA_TOKENS_60>|",
425
+ "lstrip": false,
426
+ "normalized": false,
427
+ "rstrip": false,
428
+ "single_word": false
429
+ },
430
+ {
431
+ "content": "|<EXTRA_TOKENS_61>|",
432
+ "lstrip": false,
433
+ "normalized": false,
434
+ "rstrip": false,
435
+ "single_word": false
436
+ },
437
+ {
438
+ "content": "|<EXTRA_TOKENS_62>|",
439
+ "lstrip": false,
440
+ "normalized": false,
441
+ "rstrip": false,
442
+ "single_word": false
443
+ },
444
+ {
445
+ "content": "|<EXTRA_TOKENS_63>|",
446
+ "lstrip": false,
447
+ "normalized": false,
448
+ "rstrip": false,
449
+ "single_word": false
450
+ },
451
+ {
452
+ "content": "|<EXTRA_TOKENS_64>|",
453
+ "lstrip": false,
454
+ "normalized": false,
455
+ "rstrip": false,
456
+ "single_word": false
457
+ },
458
+ {
459
+ "content": "|<EXTRA_TOKENS_65>|",
460
+ "lstrip": false,
461
+ "normalized": false,
462
+ "rstrip": false,
463
+ "single_word": false
464
+ },
465
+ {
466
+ "content": "|<EXTRA_TOKENS_66>|",
467
+ "lstrip": false,
468
+ "normalized": false,
469
+ "rstrip": false,
470
+ "single_word": false
471
+ },
472
+ {
473
+ "content": "|<EXTRA_TOKENS_67>|",
474
+ "lstrip": false,
475
+ "normalized": false,
476
+ "rstrip": false,
477
+ "single_word": false
478
+ },
479
+ {
480
+ "content": "|<EXTRA_TOKENS_68>|",
481
+ "lstrip": false,
482
+ "normalized": false,
483
+ "rstrip": false,
484
+ "single_word": false
485
+ },
486
+ {
487
+ "content": "|<EXTRA_TOKENS_69>|",
488
+ "lstrip": false,
489
+ "normalized": false,
490
+ "rstrip": false,
491
+ "single_word": false
492
+ },
493
+ {
494
+ "content": "|<EXTRA_TOKENS_70>|",
495
+ "lstrip": false,
496
+ "normalized": false,
497
+ "rstrip": false,
498
+ "single_word": false
499
+ },
500
+ {
501
+ "content": "|<EXTRA_TOKENS_71>|",
502
+ "lstrip": false,
503
+ "normalized": false,
504
+ "rstrip": false,
505
+ "single_word": false
506
+ },
507
+ {
508
+ "content": "|<EXTRA_TOKENS_72>|",
509
+ "lstrip": false,
510
+ "normalized": false,
511
+ "rstrip": false,
512
+ "single_word": false
513
+ },
514
+ {
515
+ "content": "|<EXTRA_TOKENS_73>|",
516
+ "lstrip": false,
517
+ "normalized": false,
518
+ "rstrip": false,
519
+ "single_word": false
520
+ },
521
+ {
522
+ "content": "|<EXTRA_TOKENS_74>|",
523
+ "lstrip": false,
524
+ "normalized": false,
525
+ "rstrip": false,
526
+ "single_word": false
527
+ },
528
+ {
529
+ "content": "|<EXTRA_TOKENS_75>|",
530
+ "lstrip": false,
531
+ "normalized": false,
532
+ "rstrip": false,
533
+ "single_word": false
534
+ },
535
+ {
536
+ "content": "|<EXTRA_TOKENS_76>|",
537
+ "lstrip": false,
538
+ "normalized": false,
539
+ "rstrip": false,
540
+ "single_word": false
541
+ },
542
+ {
543
+ "content": "|<EXTRA_TOKENS_77>|",
544
+ "lstrip": false,
545
+ "normalized": false,
546
+ "rstrip": false,
547
+ "single_word": false
548
+ },
549
+ {
550
+ "content": "|<EXTRA_TOKENS_78>|",
551
+ "lstrip": false,
552
+ "normalized": false,
553
+ "rstrip": false,
554
+ "single_word": false
555
+ },
556
+ {
557
+ "content": "|<EXTRA_TOKENS_79>|",
558
+ "lstrip": false,
559
+ "normalized": false,
560
+ "rstrip": false,
561
+ "single_word": false
562
+ },
563
+ {
564
+ "content": "|<EXTRA_TOKENS_80>|",
565
+ "lstrip": false,
566
+ "normalized": false,
567
+ "rstrip": false,
568
+ "single_word": false
569
+ },
570
+ {
571
+ "content": "|<EXTRA_TOKENS_81>|",
572
+ "lstrip": false,
573
+ "normalized": false,
574
+ "rstrip": false,
575
+ "single_word": false
576
+ },
577
+ {
578
+ "content": "|<EXTRA_TOKENS_82>|",
579
+ "lstrip": false,
580
+ "normalized": false,
581
+ "rstrip": false,
582
+ "single_word": false
583
+ },
584
+ {
585
+ "content": "|<EXTRA_TOKENS_83>|",
586
+ "lstrip": false,
587
+ "normalized": false,
588
+ "rstrip": false,
589
+ "single_word": false
590
+ },
591
+ {
592
+ "content": "|<EXTRA_TOKENS_84>|",
593
+ "lstrip": false,
594
+ "normalized": false,
595
+ "rstrip": false,
596
+ "single_word": false
597
+ },
598
+ {
599
+ "content": "|<EXTRA_TOKENS_85>|",
600
+ "lstrip": false,
601
+ "normalized": false,
602
+ "rstrip": false,
603
+ "single_word": false
604
+ },
605
+ {
606
+ "content": "|<EXTRA_TOKENS_86>|",
607
+ "lstrip": false,
608
+ "normalized": false,
609
+ "rstrip": false,
610
+ "single_word": false
611
+ },
612
+ {
613
+ "content": "|<EXTRA_TOKENS_87>|",
614
+ "lstrip": false,
615
+ "normalized": false,
616
+ "rstrip": false,
617
+ "single_word": false
618
+ },
619
+ {
620
+ "content": "|<EXTRA_TOKENS_88>|",
621
+ "lstrip": false,
622
+ "normalized": false,
623
+ "rstrip": false,
624
+ "single_word": false
625
+ },
626
+ {
627
+ "content": "|<EXTRA_TOKENS_89>|",
628
+ "lstrip": false,
629
+ "normalized": false,
630
+ "rstrip": false,
631
+ "single_word": false
632
+ },
633
+ {
634
+ "content": "|<EXTRA_TOKENS_90>|",
635
+ "lstrip": false,
636
+ "normalized": false,
637
+ "rstrip": false,
638
+ "single_word": false
639
+ },
640
+ {
641
+ "content": "|<EXTRA_TOKENS_91>|",
642
+ "lstrip": false,
643
+ "normalized": false,
644
+ "rstrip": false,
645
+ "single_word": false
646
+ },
647
+ {
648
+ "content": "|<EXTRA_TOKENS_92>|",
649
+ "lstrip": false,
650
+ "normalized": false,
651
+ "rstrip": false,
652
+ "single_word": false
653
+ },
654
+ {
655
+ "content": "|<EXTRA_TOKENS_93>|",
656
+ "lstrip": false,
657
+ "normalized": false,
658
+ "rstrip": false,
659
+ "single_word": false
660
+ },
661
+ {
662
+ "content": "|<EXTRA_TOKENS_94>|",
663
+ "lstrip": false,
664
+ "normalized": false,
665
+ "rstrip": false,
666
+ "single_word": false
667
+ },
668
+ {
669
+ "content": "|<EXTRA_TOKENS_95>|",
670
+ "lstrip": false,
671
+ "normalized": false,
672
+ "rstrip": false,
673
+ "single_word": false
674
+ },
675
+ {
676
+ "content": "|<EXTRA_TOKENS_96>|",
677
+ "lstrip": false,
678
+ "normalized": false,
679
+ "rstrip": false,
680
+ "single_word": false
681
+ },
682
+ {
683
+ "content": "|<EXTRA_TOKENS_97>|",
684
+ "lstrip": false,
685
+ "normalized": false,
686
+ "rstrip": false,
687
+ "single_word": false
688
+ },
689
+ {
690
+ "content": "|<EXTRA_TOKENS_98>|",
691
+ "lstrip": false,
692
+ "normalized": false,
693
+ "rstrip": false,
694
+ "single_word": false
695
+ },
696
+ {
697
+ "content": "|<EXTRA_TOKENS_99>|",
698
+ "lstrip": false,
699
+ "normalized": false,
700
+ "rstrip": false,
701
+ "single_word": false
702
+ },
703
+ {
704
+ "content": "|<EXTRA_TOKENS_100>|",
705
+ "lstrip": false,
706
+ "normalized": false,
707
+ "rstrip": false,
708
+ "single_word": false
709
+ },
710
+ {
711
+ "content": "|<EXTRA_TOKENS_101>|",
712
+ "lstrip": false,
713
+ "normalized": false,
714
+ "rstrip": false,
715
+ "single_word": false
716
+ },
717
+ {
718
+ "content": "|<EXTRA_TOKENS_102>|",
719
+ "lstrip": false,
720
+ "normalized": false,
721
+ "rstrip": false,
722
+ "single_word": false
723
+ },
724
+ {
725
+ "content": "|<EXTRA_TOKENS_103>|",
726
+ "lstrip": false,
727
+ "normalized": false,
728
+ "rstrip": false,
729
+ "single_word": false
730
+ },
731
+ {
732
+ "content": "|<EXTRA_TOKENS_104>|",
733
+ "lstrip": false,
734
+ "normalized": false,
735
+ "rstrip": false,
736
+ "single_word": false
737
+ },
738
+ {
739
+ "content": "|<EXTRA_TOKENS_105>|",
740
+ "lstrip": false,
741
+ "normalized": false,
742
+ "rstrip": false,
743
+ "single_word": false
744
+ },
745
+ {
746
+ "content": "|<EXTRA_TOKENS_106>|",
747
+ "lstrip": false,
748
+ "normalized": false,
749
+ "rstrip": false,
750
+ "single_word": false
751
+ },
752
+ {
753
+ "content": "|<EXTRA_TOKENS_107>|",
754
+ "lstrip": false,
755
+ "normalized": false,
756
+ "rstrip": false,
757
+ "single_word": false
758
+ },
759
+ {
760
+ "content": "|<EXTRA_TOKENS_108>|",
761
+ "lstrip": false,
762
+ "normalized": false,
763
+ "rstrip": false,
764
+ "single_word": false
765
+ },
766
+ {
767
+ "content": "|<EXTRA_TOKENS_109>|",
768
+ "lstrip": false,
769
+ "normalized": false,
770
+ "rstrip": false,
771
+ "single_word": false
772
+ },
773
+ {
774
+ "content": "|<EXTRA_TOKENS_110>|",
775
+ "lstrip": false,
776
+ "normalized": false,
777
+ "rstrip": false,
778
+ "single_word": false
779
+ },
780
+ {
781
+ "content": "|<EXTRA_TOKENS_111>|",
782
+ "lstrip": false,
783
+ "normalized": false,
784
+ "rstrip": false,
785
+ "single_word": false
786
+ },
787
+ {
788
+ "content": "|<EXTRA_TOKENS_112>|",
789
+ "lstrip": false,
790
+ "normalized": false,
791
+ "rstrip": false,
792
+ "single_word": false
793
+ },
794
+ {
795
+ "content": "|<EXTRA_TOKENS_113>|",
796
+ "lstrip": false,
797
+ "normalized": false,
798
+ "rstrip": false,
799
+ "single_word": false
800
+ },
801
+ {
802
+ "content": "|<EXTRA_TOKENS_114>|",
803
+ "lstrip": false,
804
+ "normalized": false,
805
+ "rstrip": false,
806
+ "single_word": false
807
+ },
808
+ {
809
+ "content": "|<EXTRA_TOKENS_115>|",
810
+ "lstrip": false,
811
+ "normalized": false,
812
+ "rstrip": false,
813
+ "single_word": false
814
+ },
815
+ {
816
+ "content": "|<EXTRA_TOKENS_116>|",
817
+ "lstrip": false,
818
+ "normalized": false,
819
+ "rstrip": false,
820
+ "single_word": false
821
+ },
822
+ {
823
+ "content": "|<EXTRA_TOKENS_117>|",
824
+ "lstrip": false,
825
+ "normalized": false,
826
+ "rstrip": false,
827
+ "single_word": false
828
+ },
829
+ {
830
+ "content": "|<EXTRA_TOKENS_118>|",
831
+ "lstrip": false,
832
+ "normalized": false,
833
+ "rstrip": false,
834
+ "single_word": false
835
+ },
836
+ {
837
+ "content": "|<EXTRA_TOKENS_119>|",
838
+ "lstrip": false,
839
+ "normalized": false,
840
+ "rstrip": false,
841
+ "single_word": false
842
+ },
843
+ {
844
+ "content": "|<EXTRA_TOKENS_120>|",
845
+ "lstrip": false,
846
+ "normalized": false,
847
+ "rstrip": false,
848
+ "single_word": false
849
+ },
850
+ {
851
+ "content": "|<EXTRA_TOKENS_121>|",
852
+ "lstrip": false,
853
+ "normalized": false,
854
+ "rstrip": false,
855
+ "single_word": false
856
+ },
857
+ {
858
+ "content": "|<EXTRA_TOKENS_122>|",
859
+ "lstrip": false,
860
+ "normalized": false,
861
+ "rstrip": false,
862
+ "single_word": false
863
+ },
864
+ {
865
+ "content": "|<EXTRA_TOKENS_123>|",
866
+ "lstrip": false,
867
+ "normalized": false,
868
+ "rstrip": false,
869
+ "single_word": false
870
+ },
871
+ {
872
+ "content": "|<EXTRA_TOKENS_124>|",
873
+ "lstrip": false,
874
+ "normalized": false,
875
+ "rstrip": false,
876
+ "single_word": false
877
+ },
878
+ {
879
+ "content": "|<EXTRA_TOKENS_125>|",
880
+ "lstrip": false,
881
+ "normalized": false,
882
+ "rstrip": false,
883
+ "single_word": false
884
+ },
885
+ {
886
+ "content": "|<EXTRA_TOKENS_126>|",
887
+ "lstrip": false,
888
+ "normalized": false,
889
+ "rstrip": false,
890
+ "single_word": false
891
+ },
892
+ {
893
+ "content": "|<EXTRA_TOKENS_127>|",
894
+ "lstrip": false,
895
+ "normalized": false,
896
+ "rstrip": false,
897
+ "single_word": false
898
+ },
899
+ {
900
+ "content": "|<EXTRA_TOKENS_128>|",
901
+ "lstrip": false,
902
+ "normalized": false,
903
+ "rstrip": false,
904
+ "single_word": false
905
+ },
906
+ {
907
+ "content": "|<EXTRA_TOKENS_129>|",
908
+ "lstrip": false,
909
+ "normalized": false,
910
+ "rstrip": false,
911
+ "single_word": false
912
+ },
913
+ {
914
+ "content": "|<EXTRA_TOKENS_130>|",
915
+ "lstrip": false,
916
+ "normalized": false,
917
+ "rstrip": false,
918
+ "single_word": false
919
+ },
920
+ {
921
+ "content": "|<EXTRA_TOKENS_131>|",
922
+ "lstrip": false,
923
+ "normalized": false,
924
+ "rstrip": false,
925
+ "single_word": false
926
+ },
927
+ {
928
+ "content": "|<EXTRA_TOKENS_132>|",
929
+ "lstrip": false,
930
+ "normalized": false,
931
+ "rstrip": false,
932
+ "single_word": false
933
+ },
934
+ {
935
+ "content": "|<EXTRA_TOKENS_133>|",
936
+ "lstrip": false,
937
+ "normalized": false,
938
+ "rstrip": false,
939
+ "single_word": false
940
+ },
941
+ {
942
+ "content": "|<EXTRA_TOKENS_134>|",
943
+ "lstrip": false,
944
+ "normalized": false,
945
+ "rstrip": false,
946
+ "single_word": false
947
+ },
948
+ {
949
+ "content": "|<EXTRA_TOKENS_135>|",
950
+ "lstrip": false,
951
+ "normalized": false,
952
+ "rstrip": false,
953
+ "single_word": false
954
+ },
955
+ {
956
+ "content": "|<EXTRA_TOKENS_136>|",
957
+ "lstrip": false,
958
+ "normalized": false,
959
+ "rstrip": false,
960
+ "single_word": false
961
+ },
962
+ {
963
+ "content": "|<EXTRA_TOKENS_137>|",
964
+ "lstrip": false,
965
+ "normalized": false,
966
+ "rstrip": false,
967
+ "single_word": false
968
+ },
969
+ {
970
+ "content": "|<EXTRA_TOKENS_138>|",
971
+ "lstrip": false,
972
+ "normalized": false,
973
+ "rstrip": false,
974
+ "single_word": false
975
+ },
976
+ {
977
+ "content": "|<EXTRA_TOKENS_139>|",
978
+ "lstrip": false,
979
+ "normalized": false,
980
+ "rstrip": false,
981
+ "single_word": false
982
+ },
983
+ {
984
+ "content": "|<EXTRA_TOKENS_140>|",
985
+ "lstrip": false,
986
+ "normalized": false,
987
+ "rstrip": false,
988
+ "single_word": false
989
+ },
990
+ {
991
+ "content": "|<EXTRA_TOKENS_141>|",
992
+ "lstrip": false,
993
+ "normalized": false,
994
+ "rstrip": false,
995
+ "single_word": false
996
+ },
997
+ {
998
+ "content": "|<EXTRA_TOKENS_142>|",
999
+ "lstrip": false,
1000
+ "normalized": false,
1001
+ "rstrip": false,
1002
+ "single_word": false
1003
+ },
1004
+ {
1005
+ "content": "|<EXTRA_TOKENS_143>|",
1006
+ "lstrip": false,
1007
+ "normalized": false,
1008
+ "rstrip": false,
1009
+ "single_word": false
1010
+ },
1011
+ {
1012
+ "content": "|<EXTRA_TOKENS_144>|",
1013
+ "lstrip": false,
1014
+ "normalized": false,
1015
+ "rstrip": false,
1016
+ "single_word": false
1017
+ },
1018
+ {
1019
+ "content": "|<EXTRA_TOKENS_145>|",
1020
+ "lstrip": false,
1021
+ "normalized": false,
1022
+ "rstrip": false,
1023
+ "single_word": false
1024
+ },
1025
+ {
1026
+ "content": "|<EXTRA_TOKENS_146>|",
1027
+ "lstrip": false,
1028
+ "normalized": false,
1029
+ "rstrip": false,
1030
+ "single_word": false
1031
+ },
1032
+ {
1033
+ "content": "|<EXTRA_TOKENS_147>|",
1034
+ "lstrip": false,
1035
+ "normalized": false,
1036
+ "rstrip": false,
1037
+ "single_word": false
1038
+ },
1039
+ {
1040
+ "content": "|<EXTRA_TOKENS_148>|",
1041
+ "lstrip": false,
1042
+ "normalized": false,
1043
+ "rstrip": false,
1044
+ "single_word": false
1045
+ },
1046
+ {
1047
+ "content": "|<EXTRA_TOKENS_149>|",
1048
+ "lstrip": false,
1049
+ "normalized": false,
1050
+ "rstrip": false,
1051
+ "single_word": false
1052
+ },
1053
+ {
1054
+ "content": "|<EXTRA_TOKENS_150>|",
1055
+ "lstrip": false,
1056
+ "normalized": false,
1057
+ "rstrip": false,
1058
+ "single_word": false
1059
+ },
1060
+ {
1061
+ "content": "|<EXTRA_TOKENS_151>|",
1062
+ "lstrip": false,
1063
+ "normalized": false,
1064
+ "rstrip": false,
1065
+ "single_word": false
1066
+ },
1067
+ {
1068
+ "content": "|<EXTRA_TOKENS_152>|",
1069
+ "lstrip": false,
1070
+ "normalized": false,
1071
+ "rstrip": false,
1072
+ "single_word": false
1073
+ },
1074
+ {
1075
+ "content": "|<EXTRA_TOKENS_153>|",
1076
+ "lstrip": false,
1077
+ "normalized": false,
1078
+ "rstrip": false,
1079
+ "single_word": false
1080
+ },
1081
+ {
1082
+ "content": "|<EXTRA_TOKENS_154>|",
1083
+ "lstrip": false,
1084
+ "normalized": false,
1085
+ "rstrip": false,
1086
+ "single_word": false
1087
+ },
1088
+ {
1089
+ "content": "|<EXTRA_TOKENS_155>|",
1090
+ "lstrip": false,
1091
+ "normalized": false,
1092
+ "rstrip": false,
1093
+ "single_word": false
1094
+ },
1095
+ {
1096
+ "content": "|<EXTRA_TOKENS_156>|",
1097
+ "lstrip": false,
1098
+ "normalized": false,
1099
+ "rstrip": false,
1100
+ "single_word": false
1101
+ },
1102
+ {
1103
+ "content": "|<EXTRA_TOKENS_157>|",
1104
+ "lstrip": false,
1105
+ "normalized": false,
1106
+ "rstrip": false,
1107
+ "single_word": false
1108
+ },
1109
+ {
1110
+ "content": "|<EXTRA_TOKENS_158>|",
1111
+ "lstrip": false,
1112
+ "normalized": false,
1113
+ "rstrip": false,
1114
+ "single_word": false
1115
+ },
1116
+ {
1117
+ "content": "|<EXTRA_TOKENS_159>|",
1118
+ "lstrip": false,
1119
+ "normalized": false,
1120
+ "rstrip": false,
1121
+ "single_word": false
1122
+ },
1123
+ {
1124
+ "content": "|<EXTRA_TOKENS_160>|",
1125
+ "lstrip": false,
1126
+ "normalized": false,
1127
+ "rstrip": false,
1128
+ "single_word": false
1129
+ },
1130
+ {
1131
+ "content": "|<EXTRA_TOKENS_161>|",
1132
+ "lstrip": false,
1133
+ "normalized": false,
1134
+ "rstrip": false,
1135
+ "single_word": false
1136
+ },
1137
+ {
1138
+ "content": "|<EXTRA_TOKENS_162>|",
1139
+ "lstrip": false,
1140
+ "normalized": false,
1141
+ "rstrip": false,
1142
+ "single_word": false
1143
+ },
1144
+ {
1145
+ "content": "|<EXTRA_TOKENS_163>|",
1146
+ "lstrip": false,
1147
+ "normalized": false,
1148
+ "rstrip": false,
1149
+ "single_word": false
1150
+ },
1151
+ {
1152
+ "content": "|<EXTRA_TOKENS_164>|",
1153
+ "lstrip": false,
1154
+ "normalized": false,
1155
+ "rstrip": false,
1156
+ "single_word": false
1157
+ },
1158
+ {
1159
+ "content": "|<EXTRA_TOKENS_165>|",
1160
+ "lstrip": false,
1161
+ "normalized": false,
1162
+ "rstrip": false,
1163
+ "single_word": false
1164
+ },
1165
+ {
1166
+ "content": "|<EXTRA_TOKENS_166>|",
1167
+ "lstrip": false,
1168
+ "normalized": false,
1169
+ "rstrip": false,
1170
+ "single_word": false
1171
+ },
1172
+ {
1173
+ "content": "|<EXTRA_TOKENS_167>|",
1174
+ "lstrip": false,
1175
+ "normalized": false,
1176
+ "rstrip": false,
1177
+ "single_word": false
1178
+ },
1179
+ {
1180
+ "content": "|<EXTRA_TOKENS_168>|",
1181
+ "lstrip": false,
1182
+ "normalized": false,
1183
+ "rstrip": false,
1184
+ "single_word": false
1185
+ },
1186
+ {
1187
+ "content": "|<EXTRA_TOKENS_169>|",
1188
+ "lstrip": false,
1189
+ "normalized": false,
1190
+ "rstrip": false,
1191
+ "single_word": false
1192
+ },
1193
+ {
1194
+ "content": "|<EXTRA_TOKENS_170>|",
1195
+ "lstrip": false,
1196
+ "normalized": false,
1197
+ "rstrip": false,
1198
+ "single_word": false
1199
+ },
1200
+ {
1201
+ "content": "|<EXTRA_TOKENS_171>|",
1202
+ "lstrip": false,
1203
+ "normalized": false,
1204
+ "rstrip": false,
1205
+ "single_word": false
1206
+ },
1207
+ {
1208
+ "content": "|<EXTRA_TOKENS_172>|",
1209
+ "lstrip": false,
1210
+ "normalized": false,
1211
+ "rstrip": false,
1212
+ "single_word": false
1213
+ },
1214
+ {
1215
+ "content": "|<EXTRA_TOKENS_173>|",
1216
+ "lstrip": false,
1217
+ "normalized": false,
1218
+ "rstrip": false,
1219
+ "single_word": false
1220
+ },
1221
+ {
1222
+ "content": "|<EXTRA_TOKENS_174>|",
1223
+ "lstrip": false,
1224
+ "normalized": false,
1225
+ "rstrip": false,
1226
+ "single_word": false
1227
+ },
1228
+ {
1229
+ "content": "|<EXTRA_TOKENS_175>|",
1230
+ "lstrip": false,
1231
+ "normalized": false,
1232
+ "rstrip": false,
1233
+ "single_word": false
1234
+ },
1235
+ {
1236
+ "content": "|<EXTRA_TOKENS_176>|",
1237
+ "lstrip": false,
1238
+ "normalized": false,
1239
+ "rstrip": false,
1240
+ "single_word": false
1241
+ },
1242
+ {
1243
+ "content": "|<EXTRA_TOKENS_177>|",
1244
+ "lstrip": false,
1245
+ "normalized": false,
1246
+ "rstrip": false,
1247
+ "single_word": false
1248
+ },
1249
+ {
1250
+ "content": "|<EXTRA_TOKENS_178>|",
1251
+ "lstrip": false,
1252
+ "normalized": false,
1253
+ "rstrip": false,
1254
+ "single_word": false
1255
+ },
1256
+ {
1257
+ "content": "|<EXTRA_TOKENS_179>|",
1258
+ "lstrip": false,
1259
+ "normalized": false,
1260
+ "rstrip": false,
1261
+ "single_word": false
1262
+ },
1263
+ {
1264
+ "content": "|<EXTRA_TOKENS_180>|",
1265
+ "lstrip": false,
1266
+ "normalized": false,
1267
+ "rstrip": false,
1268
+ "single_word": false
1269
+ },
1270
+ {
1271
+ "content": "|<EXTRA_TOKENS_181>|",
1272
+ "lstrip": false,
1273
+ "normalized": false,
1274
+ "rstrip": false,
1275
+ "single_word": false
1276
+ },
1277
+ {
1278
+ "content": "|<EXTRA_TOKENS_182>|",
1279
+ "lstrip": false,
1280
+ "normalized": false,
1281
+ "rstrip": false,
1282
+ "single_word": false
1283
+ },
1284
+ {
1285
+ "content": "|<EXTRA_TOKENS_183>|",
1286
+ "lstrip": false,
1287
+ "normalized": false,
1288
+ "rstrip": false,
1289
+ "single_word": false
1290
+ },
1291
+ {
1292
+ "content": "|<EXTRA_TOKENS_184>|",
1293
+ "lstrip": false,
1294
+ "normalized": false,
1295
+ "rstrip": false,
1296
+ "single_word": false
1297
+ },
1298
+ {
1299
+ "content": "|<EXTRA_TOKENS_185>|",
1300
+ "lstrip": false,
1301
+ "normalized": false,
1302
+ "rstrip": false,
1303
+ "single_word": false
1304
+ },
1305
+ {
1306
+ "content": "|<EXTRA_TOKENS_186>|",
1307
+ "lstrip": false,
1308
+ "normalized": false,
1309
+ "rstrip": false,
1310
+ "single_word": false
1311
+ },
1312
+ {
1313
+ "content": "|<EXTRA_TOKENS_187>|",
1314
+ "lstrip": false,
1315
+ "normalized": false,
1316
+ "rstrip": false,
1317
+ "single_word": false
1318
+ },
1319
+ {
1320
+ "content": "|<EXTRA_TOKENS_188>|",
1321
+ "lstrip": false,
1322
+ "normalized": false,
1323
+ "rstrip": false,
1324
+ "single_word": false
1325
+ },
1326
+ {
1327
+ "content": "|<EXTRA_TOKENS_189>|",
1328
+ "lstrip": false,
1329
+ "normalized": false,
1330
+ "rstrip": false,
1331
+ "single_word": false
1332
+ },
1333
+ {
1334
+ "content": "|<EXTRA_TOKENS_190>|",
1335
+ "lstrip": false,
1336
+ "normalized": false,
1337
+ "rstrip": false,
1338
+ "single_word": false
1339
+ },
1340
+ {
1341
+ "content": "|<EXTRA_TOKENS_191>|",
1342
+ "lstrip": false,
1343
+ "normalized": false,
1344
+ "rstrip": false,
1345
+ "single_word": false
1346
+ },
1347
+ {
1348
+ "content": "|<EXTRA_TOKENS_192>|",
1349
+ "lstrip": false,
1350
+ "normalized": false,
1351
+ "rstrip": false,
1352
+ "single_word": false
1353
+ },
1354
+ {
1355
+ "content": "|<EXTRA_TOKENS_193>|",
1356
+ "lstrip": false,
1357
+ "normalized": false,
1358
+ "rstrip": false,
1359
+ "single_word": false
1360
+ },
1361
+ {
1362
+ "content": "|<EXTRA_TOKENS_194>|",
1363
+ "lstrip": false,
1364
+ "normalized": false,
1365
+ "rstrip": false,
1366
+ "single_word": false
1367
+ },
1368
+ {
1369
+ "content": "|<EXTRA_TOKENS_195>|",
1370
+ "lstrip": false,
1371
+ "normalized": false,
1372
+ "rstrip": false,
1373
+ "single_word": false
1374
+ },
1375
+ {
1376
+ "content": "|<EXTRA_TOKENS_196>|",
1377
+ "lstrip": false,
1378
+ "normalized": false,
1379
+ "rstrip": false,
1380
+ "single_word": false
1381
+ },
1382
+ {
1383
+ "content": "|<EXTRA_TOKENS_197>|",
1384
+ "lstrip": false,
1385
+ "normalized": false,
1386
+ "rstrip": false,
1387
+ "single_word": false
1388
+ },
1389
+ {
1390
+ "content": "|<EXTRA_TOKENS_198>|",
1391
+ "lstrip": false,
1392
+ "normalized": false,
1393
+ "rstrip": false,
1394
+ "single_word": false
1395
+ },
1396
+ {
1397
+ "content": "|<EXTRA_TOKENS_199>|",
1398
+ "lstrip": false,
1399
+ "normalized": false,
1400
+ "rstrip": false,
1401
+ "single_word": false
1402
+ },
1403
+ {
1404
+ "content": "|<EXTRA_TOKENS_200>|",
1405
+ "lstrip": false,
1406
+ "normalized": false,
1407
+ "rstrip": false,
1408
+ "single_word": false
1409
+ },
1410
+ {
1411
+ "content": "|<EXTRA_TOKENS_201>|",
1412
+ "lstrip": false,
1413
+ "normalized": false,
1414
+ "rstrip": false,
1415
+ "single_word": false
1416
+ },
1417
+ {
1418
+ "content": "|<EXTRA_TOKENS_202>|",
1419
+ "lstrip": false,
1420
+ "normalized": false,
1421
+ "rstrip": false,
1422
+ "single_word": false
1423
+ },
1424
+ {
1425
+ "content": "|<EXTRA_TOKENS_203>|",
1426
+ "lstrip": false,
1427
+ "normalized": false,
1428
+ "rstrip": false,
1429
+ "single_word": false
1430
+ },
1431
+ {
1432
+ "content": "|<EXTRA_TOKENS_204>|",
1433
+ "lstrip": false,
1434
+ "normalized": false,
1435
+ "rstrip": false,
1436
+ "single_word": false
1437
+ },
1438
+ {
1439
+ "content": "|<EXTRA_TOKENS_205>|",
1440
+ "lstrip": false,
1441
+ "normalized": false,
1442
+ "rstrip": false,
1443
+ "single_word": false
1444
+ },
1445
+ {
1446
+ "content": "|<EXTRA_TOKENS_206>|",
1447
+ "lstrip": false,
1448
+ "normalized": false,
1449
+ "rstrip": false,
1450
+ "single_word": false
1451
+ },
1452
+ {
1453
+ "content": "|<EXTRA_TOKENS_207>|",
1454
+ "lstrip": false,
1455
+ "normalized": false,
1456
+ "rstrip": false,
1457
+ "single_word": false
1458
+ },
1459
+ {
1460
+ "content": "|<EXTRA_TOKENS_208>|",
1461
+ "lstrip": false,
1462
+ "normalized": false,
1463
+ "rstrip": false,
1464
+ "single_word": false
1465
+ },
1466
+ {
1467
+ "content": "|<EXTRA_TOKENS_209>|",
1468
+ "lstrip": false,
1469
+ "normalized": false,
1470
+ "rstrip": false,
1471
+ "single_word": false
1472
+ },
1473
+ {
1474
+ "content": "|<EXTRA_TOKENS_210>|",
1475
+ "lstrip": false,
1476
+ "normalized": false,
1477
+ "rstrip": false,
1478
+ "single_word": false
1479
+ },
1480
+ {
1481
+ "content": "|<EXTRA_TOKENS_211>|",
1482
+ "lstrip": false,
1483
+ "normalized": false,
1484
+ "rstrip": false,
1485
+ "single_word": false
1486
+ },
1487
+ {
1488
+ "content": "|<EXTRA_TOKENS_212>|",
1489
+ "lstrip": false,
1490
+ "normalized": false,
1491
+ "rstrip": false,
1492
+ "single_word": false
1493
+ },
1494
+ {
1495
+ "content": "|<EXTRA_TOKENS_213>|",
1496
+ "lstrip": false,
1497
+ "normalized": false,
1498
+ "rstrip": false,
1499
+ "single_word": false
1500
+ },
1501
+ {
1502
+ "content": "|<EXTRA_TOKENS_214>|",
1503
+ "lstrip": false,
1504
+ "normalized": false,
1505
+ "rstrip": false,
1506
+ "single_word": false
1507
+ },
1508
+ {
1509
+ "content": "|<EXTRA_TOKENS_215>|",
1510
+ "lstrip": false,
1511
+ "normalized": false,
1512
+ "rstrip": false,
1513
+ "single_word": false
1514
+ },
1515
+ {
1516
+ "content": "|<EXTRA_TOKENS_216>|",
1517
+ "lstrip": false,
1518
+ "normalized": false,
1519
+ "rstrip": false,
1520
+ "single_word": false
1521
+ },
1522
+ {
1523
+ "content": "|<EXTRA_TOKENS_217>|",
1524
+ "lstrip": false,
1525
+ "normalized": false,
1526
+ "rstrip": false,
1527
+ "single_word": false
1528
+ },
1529
+ {
1530
+ "content": "|<EXTRA_TOKENS_218>|",
1531
+ "lstrip": false,
1532
+ "normalized": false,
1533
+ "rstrip": false,
1534
+ "single_word": false
1535
+ },
1536
+ {
1537
+ "content": "|<EXTRA_TOKENS_219>|",
1538
+ "lstrip": false,
1539
+ "normalized": false,
1540
+ "rstrip": false,
1541
+ "single_word": false
1542
+ },
1543
+ {
1544
+ "content": "|<EXTRA_TOKENS_220>|",
1545
+ "lstrip": false,
1546
+ "normalized": false,
1547
+ "rstrip": false,
1548
+ "single_word": false
1549
+ },
1550
+ {
1551
+ "content": "|<EXTRA_TOKENS_221>|",
1552
+ "lstrip": false,
1553
+ "normalized": false,
1554
+ "rstrip": false,
1555
+ "single_word": false
1556
+ },
1557
+ {
1558
+ "content": "|<EXTRA_TOKENS_222>|",
1559
+ "lstrip": false,
1560
+ "normalized": false,
1561
+ "rstrip": false,
1562
+ "single_word": false
1563
+ },
1564
+ {
1565
+ "content": "|<EXTRA_TOKENS_223>|",
1566
+ "lstrip": false,
1567
+ "normalized": false,
1568
+ "rstrip": false,
1569
+ "single_word": false
1570
+ },
1571
+ {
1572
+ "content": "|<EXTRA_TOKENS_224>|",
1573
+ "lstrip": false,
1574
+ "normalized": false,
1575
+ "rstrip": false,
1576
+ "single_word": false
1577
+ },
1578
+ {
1579
+ "content": "|<EXTRA_TOKENS_225>|",
1580
+ "lstrip": false,
1581
+ "normalized": false,
1582
+ "rstrip": false,
1583
+ "single_word": false
1584
+ },
1585
+ {
1586
+ "content": "|<EXTRA_TOKENS_226>|",
1587
+ "lstrip": false,
1588
+ "normalized": false,
1589
+ "rstrip": false,
1590
+ "single_word": false
1591
+ },
1592
+ {
1593
+ "content": "|<EXTRA_TOKENS_227>|",
1594
+ "lstrip": false,
1595
+ "normalized": false,
1596
+ "rstrip": false,
1597
+ "single_word": false
1598
+ },
1599
+ {
1600
+ "content": "|<EXTRA_TOKENS_228>|",
1601
+ "lstrip": false,
1602
+ "normalized": false,
1603
+ "rstrip": false,
1604
+ "single_word": false
1605
+ },
1606
+ {
1607
+ "content": "|<EXTRA_TOKENS_229>|",
1608
+ "lstrip": false,
1609
+ "normalized": false,
1610
+ "rstrip": false,
1611
+ "single_word": false
1612
+ },
1613
+ {
1614
+ "content": "|<EXTRA_TOKENS_230>|",
1615
+ "lstrip": false,
1616
+ "normalized": false,
1617
+ "rstrip": false,
1618
+ "single_word": false
1619
+ },
1620
+ {
1621
+ "content": "|<EXTRA_TOKENS_231>|",
1622
+ "lstrip": false,
1623
+ "normalized": false,
1624
+ "rstrip": false,
1625
+ "single_word": false
1626
+ },
1627
+ {
1628
+ "content": "|<EXTRA_TOKENS_232>|",
1629
+ "lstrip": false,
1630
+ "normalized": false,
1631
+ "rstrip": false,
1632
+ "single_word": false
1633
+ },
1634
+ {
1635
+ "content": "|<EXTRA_TOKENS_233>|",
1636
+ "lstrip": false,
1637
+ "normalized": false,
1638
+ "rstrip": false,
1639
+ "single_word": false
1640
+ },
1641
+ {
1642
+ "content": "|<EXTRA_TOKENS_234>|",
1643
+ "lstrip": false,
1644
+ "normalized": false,
1645
+ "rstrip": false,
1646
+ "single_word": false
1647
+ },
1648
+ {
1649
+ "content": "|<EXTRA_TOKENS_235>|",
1650
+ "lstrip": false,
1651
+ "normalized": false,
1652
+ "rstrip": false,
1653
+ "single_word": false
1654
+ },
1655
+ {
1656
+ "content": "|<EXTRA_TOKENS_236>|",
1657
+ "lstrip": false,
1658
+ "normalized": false,
1659
+ "rstrip": false,
1660
+ "single_word": false
1661
+ },
1662
+ {
1663
+ "content": "|<EXTRA_TOKENS_237>|",
1664
+ "lstrip": false,
1665
+ "normalized": false,
1666
+ "rstrip": false,
1667
+ "single_word": false
1668
+ },
1669
+ {
1670
+ "content": "|<EXTRA_TOKENS_238>|",
1671
+ "lstrip": false,
1672
+ "normalized": false,
1673
+ "rstrip": false,
1674
+ "single_word": false
1675
+ },
1676
+ {
1677
+ "content": "|<EXTRA_TOKENS_239>|",
1678
+ "lstrip": false,
1679
+ "normalized": false,
1680
+ "rstrip": false,
1681
+ "single_word": false
1682
+ },
1683
+ {
1684
+ "content": "|<EXTRA_TOKENS_240>|",
1685
+ "lstrip": false,
1686
+ "normalized": false,
1687
+ "rstrip": false,
1688
+ "single_word": false
1689
+ },
1690
+ {
1691
+ "content": "|<EXTRA_TOKENS_241>|",
1692
+ "lstrip": false,
1693
+ "normalized": false,
1694
+ "rstrip": false,
1695
+ "single_word": false
1696
+ },
1697
+ {
1698
+ "content": "|<EXTRA_TOKENS_242>|",
1699
+ "lstrip": false,
1700
+ "normalized": false,
1701
+ "rstrip": false,
1702
+ "single_word": false
1703
+ },
1704
+ {
1705
+ "content": "|<EXTRA_TOKENS_243>|",
1706
+ "lstrip": false,
1707
+ "normalized": false,
1708
+ "rstrip": false,
1709
+ "single_word": false
1710
+ },
1711
+ {
1712
+ "content": "|<EXTRA_TOKENS_244>|",
1713
+ "lstrip": false,
1714
+ "normalized": false,
1715
+ "rstrip": false,
1716
+ "single_word": false
1717
+ },
1718
+ {
1719
+ "content": "|<EXTRA_TOKENS_245>|",
1720
+ "lstrip": false,
1721
+ "normalized": false,
1722
+ "rstrip": false,
1723
+ "single_word": false
1724
+ },
1725
+ {
1726
+ "content": "|<EXTRA_TOKENS_246>|",
1727
+ "lstrip": false,
1728
+ "normalized": false,
1729
+ "rstrip": false,
1730
+ "single_word": false
1731
+ },
1732
+ {
1733
+ "content": "|<EXTRA_TOKENS_247>|",
1734
+ "lstrip": false,
1735
+ "normalized": false,
1736
+ "rstrip": false,
1737
+ "single_word": false
1738
+ },
1739
+ {
1740
+ "content": "|<EXTRA_TOKENS_248>|",
1741
+ "lstrip": false,
1742
+ "normalized": false,
1743
+ "rstrip": false,
1744
+ "single_word": false
1745
+ },
1746
+ {
1747
+ "content": "|<EXTRA_TOKENS_249>|",
1748
+ "lstrip": false,
1749
+ "normalized": false,
1750
+ "rstrip": false,
1751
+ "single_word": false
1752
+ },
1753
+ {
1754
+ "content": "|<EXTRA_TOKENS_250>|",
1755
+ "lstrip": false,
1756
+ "normalized": false,
1757
+ "rstrip": false,
1758
+ "single_word": false
1759
+ },
1760
+ {
1761
+ "content": "|<EXTRA_TOKENS_251>|",
1762
+ "lstrip": false,
1763
+ "normalized": false,
1764
+ "rstrip": false,
1765
+ "single_word": false
1766
+ },
1767
+ {
1768
+ "content": "|<EXTRA_TOKENS_252>|",
1769
+ "lstrip": false,
1770
+ "normalized": false,
1771
+ "rstrip": false,
1772
+ "single_word": false
1773
+ },
1774
+ {
1775
+ "content": "|<EXTRA_TOKENS_253>|",
1776
+ "lstrip": false,
1777
+ "normalized": false,
1778
+ "rstrip": false,
1779
+ "single_word": false
1780
+ },
1781
+ {
1782
+ "content": "|<EXTRA_TOKENS_254>|",
1783
+ "lstrip": false,
1784
+ "normalized": false,
1785
+ "rstrip": false,
1786
+ "single_word": false
1787
+ },
1788
+ {
1789
+ "content": "|<EXTRA_TOKENS_255>|",
1790
+ "lstrip": false,
1791
+ "normalized": false,
1792
+ "rstrip": false,
1793
+ "single_word": false
1794
+ },
1795
+ {
1796
+ "content": "|<EXTRA_TOKENS_256>|",
1797
+ "lstrip": false,
1798
+ "normalized": false,
1799
+ "rstrip": false,
1800
+ "single_word": false
1801
+ },
1802
+ {
1803
+ "content": "|<EXTRA_TOKENS_257>|",
1804
+ "lstrip": false,
1805
+ "normalized": false,
1806
+ "rstrip": false,
1807
+ "single_word": false
1808
+ },
1809
+ {
1810
+ "content": "|<EXTRA_TOKENS_258>|",
1811
+ "lstrip": false,
1812
+ "normalized": false,
1813
+ "rstrip": false,
1814
+ "single_word": false
1815
+ },
1816
+ {
1817
+ "content": "|<EXTRA_TOKENS_259>|",
1818
+ "lstrip": false,
1819
+ "normalized": false,
1820
+ "rstrip": false,
1821
+ "single_word": false
1822
+ },
1823
+ {
1824
+ "content": "|<EXTRA_TOKENS_260>|",
1825
+ "lstrip": false,
1826
+ "normalized": false,
1827
+ "rstrip": false,
1828
+ "single_word": false
1829
+ },
1830
+ {
1831
+ "content": "|<EXTRA_TOKENS_261>|",
1832
+ "lstrip": false,
1833
+ "normalized": false,
1834
+ "rstrip": false,
1835
+ "single_word": false
1836
+ },
1837
+ {
1838
+ "content": "|<EXTRA_TOKENS_262>|",
1839
+ "lstrip": false,
1840
+ "normalized": false,
1841
+ "rstrip": false,
1842
+ "single_word": false
1843
+ },
1844
+ {
1845
+ "content": "|<EXTRA_TOKENS_263>|",
1846
+ "lstrip": false,
1847
+ "normalized": false,
1848
+ "rstrip": false,
1849
+ "single_word": false
1850
+ },
1851
+ {
1852
+ "content": "|<EXTRA_TOKENS_264>|",
1853
+ "lstrip": false,
1854
+ "normalized": false,
1855
+ "rstrip": false,
1856
+ "single_word": false
1857
+ },
1858
+ {
1859
+ "content": "|<EXTRA_TOKENS_265>|",
1860
+ "lstrip": false,
1861
+ "normalized": false,
1862
+ "rstrip": false,
1863
+ "single_word": false
1864
+ },
1865
+ {
1866
+ "content": "|<EXTRA_TOKENS_266>|",
1867
+ "lstrip": false,
1868
+ "normalized": false,
1869
+ "rstrip": false,
1870
+ "single_word": false
1871
+ },
1872
+ {
1873
+ "content": "|<EXTRA_TOKENS_267>|",
1874
+ "lstrip": false,
1875
+ "normalized": false,
1876
+ "rstrip": false,
1877
+ "single_word": false
1878
+ },
1879
+ {
1880
+ "content": "|<EXTRA_TOKENS_268>|",
1881
+ "lstrip": false,
1882
+ "normalized": false,
1883
+ "rstrip": false,
1884
+ "single_word": false
1885
+ },
1886
+ {
1887
+ "content": "|<EXTRA_TOKENS_269>|",
1888
+ "lstrip": false,
1889
+ "normalized": false,
1890
+ "rstrip": false,
1891
+ "single_word": false
1892
+ },
1893
+ {
1894
+ "content": "|<EXTRA_TOKENS_270>|",
1895
+ "lstrip": false,
1896
+ "normalized": false,
1897
+ "rstrip": false,
1898
+ "single_word": false
1899
+ },
1900
+ {
1901
+ "content": "|<EXTRA_TOKENS_271>|",
1902
+ "lstrip": false,
1903
+ "normalized": false,
1904
+ "rstrip": false,
1905
+ "single_word": false
1906
+ },
1907
+ {
1908
+ "content": "|<EXTRA_TOKENS_272>|",
1909
+ "lstrip": false,
1910
+ "normalized": false,
1911
+ "rstrip": false,
1912
+ "single_word": false
1913
+ },
1914
+ {
1915
+ "content": "|<EXTRA_TOKENS_273>|",
1916
+ "lstrip": false,
1917
+ "normalized": false,
1918
+ "rstrip": false,
1919
+ "single_word": false
1920
+ },
1921
+ {
1922
+ "content": "|<EXTRA_TOKENS_274>|",
1923
+ "lstrip": false,
1924
+ "normalized": false,
1925
+ "rstrip": false,
1926
+ "single_word": false
1927
+ },
1928
+ {
1929
+ "content": "|<EXTRA_TOKENS_275>|",
1930
+ "lstrip": false,
1931
+ "normalized": false,
1932
+ "rstrip": false,
1933
+ "single_word": false
1934
+ },
1935
+ {
1936
+ "content": "|<EXTRA_TOKENS_276>|",
1937
+ "lstrip": false,
1938
+ "normalized": false,
1939
+ "rstrip": false,
1940
+ "single_word": false
1941
+ },
1942
+ {
1943
+ "content": "|<EXTRA_TOKENS_277>|",
1944
+ "lstrip": false,
1945
+ "normalized": false,
1946
+ "rstrip": false,
1947
+ "single_word": false
1948
+ },
1949
+ {
1950
+ "content": "|<EXTRA_TOKENS_278>|",
1951
+ "lstrip": false,
1952
+ "normalized": false,
1953
+ "rstrip": false,
1954
+ "single_word": false
1955
+ },
1956
+ {
1957
+ "content": "|<EXTRA_TOKENS_279>|",
1958
+ "lstrip": false,
1959
+ "normalized": false,
1960
+ "rstrip": false,
1961
+ "single_word": false
1962
+ },
1963
+ {
1964
+ "content": "|<EXTRA_TOKENS_280>|",
1965
+ "lstrip": false,
1966
+ "normalized": false,
1967
+ "rstrip": false,
1968
+ "single_word": false
1969
+ },
1970
+ {
1971
+ "content": "|<EXTRA_TOKENS_281>|",
1972
+ "lstrip": false,
1973
+ "normalized": false,
1974
+ "rstrip": false,
1975
+ "single_word": false
1976
+ },
1977
+ {
1978
+ "content": "|<EXTRA_TOKENS_282>|",
1979
+ "lstrip": false,
1980
+ "normalized": false,
1981
+ "rstrip": false,
1982
+ "single_word": false
1983
+ },
1984
+ {
1985
+ "content": "|<EXTRA_TOKENS_283>|",
1986
+ "lstrip": false,
1987
+ "normalized": false,
1988
+ "rstrip": false,
1989
+ "single_word": false
1990
+ },
1991
+ {
1992
+ "content": "|<EXTRA_TOKENS_284>|",
1993
+ "lstrip": false,
1994
+ "normalized": false,
1995
+ "rstrip": false,
1996
+ "single_word": false
1997
+ },
1998
+ {
1999
+ "content": "|<EXTRA_TOKENS_285>|",
2000
+ "lstrip": false,
2001
+ "normalized": false,
2002
+ "rstrip": false,
2003
+ "single_word": false
2004
+ },
2005
+ {
2006
+ "content": "|<EXTRA_TOKENS_286>|",
2007
+ "lstrip": false,
2008
+ "normalized": false,
2009
+ "rstrip": false,
2010
+ "single_word": false
2011
+ },
2012
+ {
2013
+ "content": "|<EXTRA_TOKENS_287>|",
2014
+ "lstrip": false,
2015
+ "normalized": false,
2016
+ "rstrip": false,
2017
+ "single_word": false
2018
+ },
2019
+ {
2020
+ "content": "|<EXTRA_TOKENS_288>|",
2021
+ "lstrip": false,
2022
+ "normalized": false,
2023
+ "rstrip": false,
2024
+ "single_word": false
2025
+ },
2026
+ {
2027
+ "content": "|<EXTRA_TOKENS_289>|",
2028
+ "lstrip": false,
2029
+ "normalized": false,
2030
+ "rstrip": false,
2031
+ "single_word": false
2032
+ },
2033
+ {
2034
+ "content": "|<EXTRA_TOKENS_290>|",
2035
+ "lstrip": false,
2036
+ "normalized": false,
2037
+ "rstrip": false,
2038
+ "single_word": false
2039
+ },
2040
+ {
2041
+ "content": "|<EXTRA_TOKENS_291>|",
2042
+ "lstrip": false,
2043
+ "normalized": false,
2044
+ "rstrip": false,
2045
+ "single_word": false
2046
+ },
2047
+ {
2048
+ "content": "|<EXTRA_TOKENS_292>|",
2049
+ "lstrip": false,
2050
+ "normalized": false,
2051
+ "rstrip": false,
2052
+ "single_word": false
2053
+ },
2054
+ {
2055
+ "content": "|<EXTRA_TOKENS_293>|",
2056
+ "lstrip": false,
2057
+ "normalized": false,
2058
+ "rstrip": false,
2059
+ "single_word": false
2060
+ },
2061
+ {
2062
+ "content": "|<EXTRA_TOKENS_294>|",
2063
+ "lstrip": false,
2064
+ "normalized": false,
2065
+ "rstrip": false,
2066
+ "single_word": false
2067
+ },
2068
+ {
2069
+ "content": "|<EXTRA_TOKENS_295>|",
2070
+ "lstrip": false,
2071
+ "normalized": false,
2072
+ "rstrip": false,
2073
+ "single_word": false
2074
+ },
2075
+ {
2076
+ "content": "|<EXTRA_TOKENS_296>|",
2077
+ "lstrip": false,
2078
+ "normalized": false,
2079
+ "rstrip": false,
2080
+ "single_word": false
2081
+ },
2082
+ {
2083
+ "content": "|<EXTRA_TOKENS_297>|",
2084
+ "lstrip": false,
2085
+ "normalized": false,
2086
+ "rstrip": false,
2087
+ "single_word": false
2088
+ },
2089
+ {
2090
+ "content": "|<EXTRA_TOKENS_298>|",
2091
+ "lstrip": false,
2092
+ "normalized": false,
2093
+ "rstrip": false,
2094
+ "single_word": false
2095
+ },
2096
+ {
2097
+ "content": "|<EXTRA_TOKENS_299>|",
2098
+ "lstrip": false,
2099
+ "normalized": false,
2100
+ "rstrip": false,
2101
+ "single_word": false
2102
+ },
2103
+ {
2104
+ "content": "|<EXTRA_TOKENS_300>|",
2105
+ "lstrip": false,
2106
+ "normalized": false,
2107
+ "rstrip": false,
2108
+ "single_word": false
2109
+ },
2110
+ {
2111
+ "content": "|<EXTRA_TOKENS_301>|",
2112
+ "lstrip": false,
2113
+ "normalized": false,
2114
+ "rstrip": false,
2115
+ "single_word": false
2116
+ },
2117
+ {
2118
+ "content": "|<EXTRA_TOKENS_302>|",
2119
+ "lstrip": false,
2120
+ "normalized": false,
2121
+ "rstrip": false,
2122
+ "single_word": false
2123
+ },
2124
+ {
2125
+ "content": "|<EXTRA_TOKENS_303>|",
2126
+ "lstrip": false,
2127
+ "normalized": false,
2128
+ "rstrip": false,
2129
+ "single_word": false
2130
+ },
2131
+ {
2132
+ "content": "|<EXTRA_TOKENS_304>|",
2133
+ "lstrip": false,
2134
+ "normalized": false,
2135
+ "rstrip": false,
2136
+ "single_word": false
2137
+ },
2138
+ {
2139
+ "content": "|<EXTRA_TOKENS_305>|",
2140
+ "lstrip": false,
2141
+ "normalized": false,
2142
+ "rstrip": false,
2143
+ "single_word": false
2144
+ },
2145
+ {
2146
+ "content": "|<EXTRA_TOKENS_306>|",
2147
+ "lstrip": false,
2148
+ "normalized": false,
2149
+ "rstrip": false,
2150
+ "single_word": false
2151
+ },
2152
+ {
2153
+ "content": "|<EXTRA_TOKENS_307>|",
2154
+ "lstrip": false,
2155
+ "normalized": false,
2156
+ "rstrip": false,
2157
+ "single_word": false
2158
+ },
2159
+ {
2160
+ "content": "|<EXTRA_TOKENS_308>|",
2161
+ "lstrip": false,
2162
+ "normalized": false,
2163
+ "rstrip": false,
2164
+ "single_word": false
2165
+ },
2166
+ {
2167
+ "content": "|<EXTRA_TOKENS_309>|",
2168
+ "lstrip": false,
2169
+ "normalized": false,
2170
+ "rstrip": false,
2171
+ "single_word": false
2172
+ },
2173
+ {
2174
+ "content": "|<EXTRA_TOKENS_310>|",
2175
+ "lstrip": false,
2176
+ "normalized": false,
2177
+ "rstrip": false,
2178
+ "single_word": false
2179
+ },
2180
+ {
2181
+ "content": "|<EXTRA_TOKENS_311>|",
2182
+ "lstrip": false,
2183
+ "normalized": false,
2184
+ "rstrip": false,
2185
+ "single_word": false
2186
+ },
2187
+ {
2188
+ "content": "|<EXTRA_TOKENS_312>|",
2189
+ "lstrip": false,
2190
+ "normalized": false,
2191
+ "rstrip": false,
2192
+ "single_word": false
2193
+ },
2194
+ {
2195
+ "content": "|<EXTRA_TOKENS_313>|",
2196
+ "lstrip": false,
2197
+ "normalized": false,
2198
+ "rstrip": false,
2199
+ "single_word": false
2200
+ },
2201
+ {
2202
+ "content": "|<EXTRA_TOKENS_314>|",
2203
+ "lstrip": false,
2204
+ "normalized": false,
2205
+ "rstrip": false,
2206
+ "single_word": false
2207
+ },
2208
+ {
2209
+ "content": "|<EXTRA_TOKENS_315>|",
2210
+ "lstrip": false,
2211
+ "normalized": false,
2212
+ "rstrip": false,
2213
+ "single_word": false
2214
+ },
2215
+ {
2216
+ "content": "|<EXTRA_TOKENS_316>|",
2217
+ "lstrip": false,
2218
+ "normalized": false,
2219
+ "rstrip": false,
2220
+ "single_word": false
2221
+ },
2222
+ {
2223
+ "content": "|<EXTRA_TOKENS_317>|",
2224
+ "lstrip": false,
2225
+ "normalized": false,
2226
+ "rstrip": false,
2227
+ "single_word": false
2228
+ },
2229
+ {
2230
+ "content": "|<EXTRA_TOKENS_318>|",
2231
+ "lstrip": false,
2232
+ "normalized": false,
2233
+ "rstrip": false,
2234
+ "single_word": false
2235
+ },
2236
+ {
2237
+ "content": "|<EXTRA_TOKENS_319>|",
2238
+ "lstrip": false,
2239
+ "normalized": false,
2240
+ "rstrip": false,
2241
+ "single_word": false
2242
+ },
2243
+ {
2244
+ "content": "|<EXTRA_TOKENS_320>|",
2245
+ "lstrip": false,
2246
+ "normalized": false,
2247
+ "rstrip": false,
2248
+ "single_word": false
2249
+ },
2250
+ {
2251
+ "content": "|<EXTRA_TOKENS_321>|",
2252
+ "lstrip": false,
2253
+ "normalized": false,
2254
+ "rstrip": false,
2255
+ "single_word": false
2256
+ },
2257
+ {
2258
+ "content": "|<EXTRA_TOKENS_322>|",
2259
+ "lstrip": false,
2260
+ "normalized": false,
2261
+ "rstrip": false,
2262
+ "single_word": false
2263
+ },
2264
+ {
2265
+ "content": "|<EXTRA_TOKENS_323>|",
2266
+ "lstrip": false,
2267
+ "normalized": false,
2268
+ "rstrip": false,
2269
+ "single_word": false
2270
+ },
2271
+ {
2272
+ "content": "|<EXTRA_TOKENS_324>|",
2273
+ "lstrip": false,
2274
+ "normalized": false,
2275
+ "rstrip": false,
2276
+ "single_word": false
2277
+ },
2278
+ {
2279
+ "content": "|<EXTRA_TOKENS_325>|",
2280
+ "lstrip": false,
2281
+ "normalized": false,
2282
+ "rstrip": false,
2283
+ "single_word": false
2284
+ },
2285
+ {
2286
+ "content": "|<EXTRA_TOKENS_326>|",
2287
+ "lstrip": false,
2288
+ "normalized": false,
2289
+ "rstrip": false,
2290
+ "single_word": false
2291
+ },
2292
+ {
2293
+ "content": "|<EXTRA_TOKENS_327>|",
2294
+ "lstrip": false,
2295
+ "normalized": false,
2296
+ "rstrip": false,
2297
+ "single_word": false
2298
+ },
2299
+ {
2300
+ "content": "|<EXTRA_TOKENS_328>|",
2301
+ "lstrip": false,
2302
+ "normalized": false,
2303
+ "rstrip": false,
2304
+ "single_word": false
2305
+ },
2306
+ {
2307
+ "content": "|<EXTRA_TOKENS_329>|",
2308
+ "lstrip": false,
2309
+ "normalized": false,
2310
+ "rstrip": false,
2311
+ "single_word": false
2312
+ },
2313
+ {
2314
+ "content": "|<EXTRA_TOKENS_330>|",
2315
+ "lstrip": false,
2316
+ "normalized": false,
2317
+ "rstrip": false,
2318
+ "single_word": false
2319
+ },
2320
+ {
2321
+ "content": "|<EXTRA_TOKENS_331>|",
2322
+ "lstrip": false,
2323
+ "normalized": false,
2324
+ "rstrip": false,
2325
+ "single_word": false
2326
+ },
2327
+ {
2328
+ "content": "|<EXTRA_TOKENS_332>|",
2329
+ "lstrip": false,
2330
+ "normalized": false,
2331
+ "rstrip": false,
2332
+ "single_word": false
2333
+ },
2334
+ {
2335
+ "content": "|<EXTRA_TOKENS_333>|",
2336
+ "lstrip": false,
2337
+ "normalized": false,
2338
+ "rstrip": false,
2339
+ "single_word": false
2340
+ },
2341
+ {
2342
+ "content": "|<EXTRA_TOKENS_334>|",
2343
+ "lstrip": false,
2344
+ "normalized": false,
2345
+ "rstrip": false,
2346
+ "single_word": false
2347
+ },
2348
+ {
2349
+ "content": "|<EXTRA_TOKENS_335>|",
2350
+ "lstrip": false,
2351
+ "normalized": false,
2352
+ "rstrip": false,
2353
+ "single_word": false
2354
+ },
2355
+ {
2356
+ "content": "|<EXTRA_TOKENS_336>|",
2357
+ "lstrip": false,
2358
+ "normalized": false,
2359
+ "rstrip": false,
2360
+ "single_word": false
2361
+ },
2362
+ {
2363
+ "content": "|<EXTRA_TOKENS_337>|",
2364
+ "lstrip": false,
2365
+ "normalized": false,
2366
+ "rstrip": false,
2367
+ "single_word": false
2368
+ },
2369
+ {
2370
+ "content": "|<EXTRA_TOKENS_338>|",
2371
+ "lstrip": false,
2372
+ "normalized": false,
2373
+ "rstrip": false,
2374
+ "single_word": false
2375
+ },
2376
+ {
2377
+ "content": "|<EXTRA_TOKENS_339>|",
2378
+ "lstrip": false,
2379
+ "normalized": false,
2380
+ "rstrip": false,
2381
+ "single_word": false
2382
+ },
2383
+ {
2384
+ "content": "|<EXTRA_TOKENS_340>|",
2385
+ "lstrip": false,
2386
+ "normalized": false,
2387
+ "rstrip": false,
2388
+ "single_word": false
2389
+ },
2390
+ {
2391
+ "content": "|<EXTRA_TOKENS_341>|",
2392
+ "lstrip": false,
2393
+ "normalized": false,
2394
+ "rstrip": false,
2395
+ "single_word": false
2396
+ },
2397
+ {
2398
+ "content": "|<EXTRA_TOKENS_342>|",
2399
+ "lstrip": false,
2400
+ "normalized": false,
2401
+ "rstrip": false,
2402
+ "single_word": false
2403
+ },
2404
+ {
2405
+ "content": "|<EXTRA_TOKENS_343>|",
2406
+ "lstrip": false,
2407
+ "normalized": false,
2408
+ "rstrip": false,
2409
+ "single_word": false
2410
+ },
2411
+ {
2412
+ "content": "|<EXTRA_TOKENS_344>|",
2413
+ "lstrip": false,
2414
+ "normalized": false,
2415
+ "rstrip": false,
2416
+ "single_word": false
2417
+ },
2418
+ {
2419
+ "content": "|<EXTRA_TOKENS_345>|",
2420
+ "lstrip": false,
2421
+ "normalized": false,
2422
+ "rstrip": false,
2423
+ "single_word": false
2424
+ },
2425
+ {
2426
+ "content": "|<EXTRA_TOKENS_346>|",
2427
+ "lstrip": false,
2428
+ "normalized": false,
2429
+ "rstrip": false,
2430
+ "single_word": false
2431
+ },
2432
+ {
2433
+ "content": "|<EXTRA_TOKENS_347>|",
2434
+ "lstrip": false,
2435
+ "normalized": false,
2436
+ "rstrip": false,
2437
+ "single_word": false
2438
+ },
2439
+ {
2440
+ "content": "|<EXTRA_TOKENS_348>|",
2441
+ "lstrip": false,
2442
+ "normalized": false,
2443
+ "rstrip": false,
2444
+ "single_word": false
2445
+ },
2446
+ {
2447
+ "content": "|<EXTRA_TOKENS_349>|",
2448
+ "lstrip": false,
2449
+ "normalized": false,
2450
+ "rstrip": false,
2451
+ "single_word": false
2452
+ },
2453
+ {
2454
+ "content": "|<EXTRA_TOKENS_350>|",
2455
+ "lstrip": false,
2456
+ "normalized": false,
2457
+ "rstrip": false,
2458
+ "single_word": false
2459
+ },
2460
+ {
2461
+ "content": "|<EXTRA_TOKENS_351>|",
2462
+ "lstrip": false,
2463
+ "normalized": false,
2464
+ "rstrip": false,
2465
+ "single_word": false
2466
+ },
2467
+ {
2468
+ "content": "|<EXTRA_TOKENS_352>|",
2469
+ "lstrip": false,
2470
+ "normalized": false,
2471
+ "rstrip": false,
2472
+ "single_word": false
2473
+ },
2474
+ {
2475
+ "content": "|<EXTRA_TOKENS_353>|",
2476
+ "lstrip": false,
2477
+ "normalized": false,
2478
+ "rstrip": false,
2479
+ "single_word": false
2480
+ },
2481
+ {
2482
+ "content": "|<EXTRA_TOKENS_354>|",
2483
+ "lstrip": false,
2484
+ "normalized": false,
2485
+ "rstrip": false,
2486
+ "single_word": false
2487
+ },
2488
+ {
2489
+ "content": "|<EXTRA_TOKENS_355>|",
2490
+ "lstrip": false,
2491
+ "normalized": false,
2492
+ "rstrip": false,
2493
+ "single_word": false
2494
+ },
2495
+ {
2496
+ "content": "|<EXTRA_TOKENS_356>|",
2497
+ "lstrip": false,
2498
+ "normalized": false,
2499
+ "rstrip": false,
2500
+ "single_word": false
2501
+ },
2502
+ {
2503
+ "content": "|<EXTRA_TOKENS_357>|",
2504
+ "lstrip": false,
2505
+ "normalized": false,
2506
+ "rstrip": false,
2507
+ "single_word": false
2508
+ },
2509
+ {
2510
+ "content": "|<EXTRA_TOKENS_358>|",
2511
+ "lstrip": false,
2512
+ "normalized": false,
2513
+ "rstrip": false,
2514
+ "single_word": false
2515
+ },
2516
+ {
2517
+ "content": "|<EXTRA_TOKENS_359>|",
2518
+ "lstrip": false,
2519
+ "normalized": false,
2520
+ "rstrip": false,
2521
+ "single_word": false
2522
+ },
2523
+ {
2524
+ "content": "|<EXTRA_TOKENS_360>|",
2525
+ "lstrip": false,
2526
+ "normalized": false,
2527
+ "rstrip": false,
2528
+ "single_word": false
2529
+ },
2530
+ {
2531
+ "content": "|<EXTRA_TOKENS_361>|",
2532
+ "lstrip": false,
2533
+ "normalized": false,
2534
+ "rstrip": false,
2535
+ "single_word": false
2536
+ },
2537
+ {
2538
+ "content": "|<EXTRA_TOKENS_362>|",
2539
+ "lstrip": false,
2540
+ "normalized": false,
2541
+ "rstrip": false,
2542
+ "single_word": false
2543
+ },
2544
+ {
2545
+ "content": "|<EXTRA_TOKENS_363>|",
2546
+ "lstrip": false,
2547
+ "normalized": false,
2548
+ "rstrip": false,
2549
+ "single_word": false
2550
+ },
2551
+ {
2552
+ "content": "|<EXTRA_TOKENS_364>|",
2553
+ "lstrip": false,
2554
+ "normalized": false,
2555
+ "rstrip": false,
2556
+ "single_word": false
2557
+ },
2558
+ {
2559
+ "content": "|<EXTRA_TOKENS_365>|",
2560
+ "lstrip": false,
2561
+ "normalized": false,
2562
+ "rstrip": false,
2563
+ "single_word": false
2564
+ },
2565
+ {
2566
+ "content": "|<EXTRA_TOKENS_366>|",
2567
+ "lstrip": false,
2568
+ "normalized": false,
2569
+ "rstrip": false,
2570
+ "single_word": false
2571
+ },
2572
+ {
2573
+ "content": "|<EXTRA_TOKENS_367>|",
2574
+ "lstrip": false,
2575
+ "normalized": false,
2576
+ "rstrip": false,
2577
+ "single_word": false
2578
+ },
2579
+ {
2580
+ "content": "|<EXTRA_TOKENS_368>|",
2581
+ "lstrip": false,
2582
+ "normalized": false,
2583
+ "rstrip": false,
2584
+ "single_word": false
2585
+ },
2586
+ {
2587
+ "content": "|<EXTRA_TOKENS_369>|",
2588
+ "lstrip": false,
2589
+ "normalized": false,
2590
+ "rstrip": false,
2591
+ "single_word": false
2592
+ },
2593
+ {
2594
+ "content": "|<EXTRA_TOKENS_370>|",
2595
+ "lstrip": false,
2596
+ "normalized": false,
2597
+ "rstrip": false,
2598
+ "single_word": false
2599
+ },
2600
+ {
2601
+ "content": "|<EXTRA_TOKENS_371>|",
2602
+ "lstrip": false,
2603
+ "normalized": false,
2604
+ "rstrip": false,
2605
+ "single_word": false
2606
+ },
2607
+ {
2608
+ "content": "|<EXTRA_TOKENS_372>|",
2609
+ "lstrip": false,
2610
+ "normalized": false,
2611
+ "rstrip": false,
2612
+ "single_word": false
2613
+ },
2614
+ {
2615
+ "content": "|<EXTRA_TOKENS_373>|",
2616
+ "lstrip": false,
2617
+ "normalized": false,
2618
+ "rstrip": false,
2619
+ "single_word": false
2620
+ },
2621
+ {
2622
+ "content": "|<EXTRA_TOKENS_374>|",
2623
+ "lstrip": false,
2624
+ "normalized": false,
2625
+ "rstrip": false,
2626
+ "single_word": false
2627
+ },
2628
+ {
2629
+ "content": "|<EXTRA_TOKENS_375>|",
2630
+ "lstrip": false,
2631
+ "normalized": false,
2632
+ "rstrip": false,
2633
+ "single_word": false
2634
+ },
2635
+ {
2636
+ "content": "|<EXTRA_TOKENS_376>|",
2637
+ "lstrip": false,
2638
+ "normalized": false,
2639
+ "rstrip": false,
2640
+ "single_word": false
2641
+ },
2642
+ {
2643
+ "content": "|<EXTRA_TOKENS_377>|",
2644
+ "lstrip": false,
2645
+ "normalized": false,
2646
+ "rstrip": false,
2647
+ "single_word": false
2648
+ },
2649
+ {
2650
+ "content": "|<EXTRA_TOKENS_378>|",
2651
+ "lstrip": false,
2652
+ "normalized": false,
2653
+ "rstrip": false,
2654
+ "single_word": false
2655
+ },
2656
+ {
2657
+ "content": "|<EXTRA_TOKENS_379>|",
2658
+ "lstrip": false,
2659
+ "normalized": false,
2660
+ "rstrip": false,
2661
+ "single_word": false
2662
+ },
2663
+ {
2664
+ "content": "|<EXTRA_TOKENS_380>|",
2665
+ "lstrip": false,
2666
+ "normalized": false,
2667
+ "rstrip": false,
2668
+ "single_word": false
2669
+ },
2670
+ {
2671
+ "content": "|<EXTRA_TOKENS_381>|",
2672
+ "lstrip": false,
2673
+ "normalized": false,
2674
+ "rstrip": false,
2675
+ "single_word": false
2676
+ },
2677
+ {
2678
+ "content": "|<EXTRA_TOKENS_382>|",
2679
+ "lstrip": false,
2680
+ "normalized": false,
2681
+ "rstrip": false,
2682
+ "single_word": false
2683
+ },
2684
+ {
2685
+ "content": "|<EXTRA_TOKENS_383>|",
2686
+ "lstrip": false,
2687
+ "normalized": false,
2688
+ "rstrip": false,
2689
+ "single_word": false
2690
+ },
2691
+ {
2692
+ "content": "|<EXTRA_TOKENS_384>|",
2693
+ "lstrip": false,
2694
+ "normalized": false,
2695
+ "rstrip": false,
2696
+ "single_word": false
2697
+ },
2698
+ {
2699
+ "content": "|<EXTRA_TOKENS_385>|",
2700
+ "lstrip": false,
2701
+ "normalized": false,
2702
+ "rstrip": false,
2703
+ "single_word": false
2704
+ },
2705
+ {
2706
+ "content": "|<EXTRA_TOKENS_386>|",
2707
+ "lstrip": false,
2708
+ "normalized": false,
2709
+ "rstrip": false,
2710
+ "single_word": false
2711
+ },
2712
+ {
2713
+ "content": "|<EXTRA_TOKENS_387>|",
2714
+ "lstrip": false,
2715
+ "normalized": false,
2716
+ "rstrip": false,
2717
+ "single_word": false
2718
+ },
2719
+ {
2720
+ "content": "|<EXTRA_TOKENS_388>|",
2721
+ "lstrip": false,
2722
+ "normalized": false,
2723
+ "rstrip": false,
2724
+ "single_word": false
2725
+ },
2726
+ {
2727
+ "content": "|<EXTRA_TOKENS_389>|",
2728
+ "lstrip": false,
2729
+ "normalized": false,
2730
+ "rstrip": false,
2731
+ "single_word": false
2732
+ },
2733
+ {
2734
+ "content": "|<EXTRA_TOKENS_390>|",
2735
+ "lstrip": false,
2736
+ "normalized": false,
2737
+ "rstrip": false,
2738
+ "single_word": false
2739
+ },
2740
+ {
2741
+ "content": "|<EXTRA_TOKENS_391>|",
2742
+ "lstrip": false,
2743
+ "normalized": false,
2744
+ "rstrip": false,
2745
+ "single_word": false
2746
+ },
2747
+ {
2748
+ "content": "|<EXTRA_TOKENS_392>|",
2749
+ "lstrip": false,
2750
+ "normalized": false,
2751
+ "rstrip": false,
2752
+ "single_word": false
2753
+ },
2754
+ {
2755
+ "content": "|<EXTRA_TOKENS_393>|",
2756
+ "lstrip": false,
2757
+ "normalized": false,
2758
+ "rstrip": false,
2759
+ "single_word": false
2760
+ },
2761
+ {
2762
+ "content": "|<EXTRA_TOKENS_394>|",
2763
+ "lstrip": false,
2764
+ "normalized": false,
2765
+ "rstrip": false,
2766
+ "single_word": false
2767
+ },
2768
+ {
2769
+ "content": "|<EXTRA_TOKENS_395>|",
2770
+ "lstrip": false,
2771
+ "normalized": false,
2772
+ "rstrip": false,
2773
+ "single_word": false
2774
+ },
2775
+ {
2776
+ "content": "|<EXTRA_TOKENS_396>|",
2777
+ "lstrip": false,
2778
+ "normalized": false,
2779
+ "rstrip": false,
2780
+ "single_word": false
2781
+ },
2782
+ {
2783
+ "content": "|<EXTRA_TOKENS_397>|",
2784
+ "lstrip": false,
2785
+ "normalized": false,
2786
+ "rstrip": false,
2787
+ "single_word": false
2788
+ },
2789
+ {
2790
+ "content": "|<EXTRA_TOKENS_398>|",
2791
+ "lstrip": false,
2792
+ "normalized": false,
2793
+ "rstrip": false,
2794
+ "single_word": false
2795
+ },
2796
+ {
2797
+ "content": "|<EXTRA_TOKENS_399>|",
2798
+ "lstrip": false,
2799
+ "normalized": false,
2800
+ "rstrip": false,
2801
+ "single_word": false
2802
+ },
2803
+ {
2804
+ "content": "|<EXTRA_TOKENS_400>|",
2805
+ "lstrip": false,
2806
+ "normalized": false,
2807
+ "rstrip": false,
2808
+ "single_word": false
2809
+ },
2810
+ {
2811
+ "content": "|<EXTRA_TOKENS_401>|",
2812
+ "lstrip": false,
2813
+ "normalized": false,
2814
+ "rstrip": false,
2815
+ "single_word": false
2816
+ },
2817
+ {
2818
+ "content": "|<EXTRA_TOKENS_402>|",
2819
+ "lstrip": false,
2820
+ "normalized": false,
2821
+ "rstrip": false,
2822
+ "single_word": false
2823
+ },
2824
+ {
2825
+ "content": "|<EXTRA_TOKENS_403>|",
2826
+ "lstrip": false,
2827
+ "normalized": false,
2828
+ "rstrip": false,
2829
+ "single_word": false
2830
+ },
2831
+ {
2832
+ "content": "|<EXTRA_TOKENS_404>|",
2833
+ "lstrip": false,
2834
+ "normalized": false,
2835
+ "rstrip": false,
2836
+ "single_word": false
2837
+ },
2838
+ {
2839
+ "content": "|<EXTRA_TOKENS_405>|",
2840
+ "lstrip": false,
2841
+ "normalized": false,
2842
+ "rstrip": false,
2843
+ "single_word": false
2844
+ },
2845
+ {
2846
+ "content": "|<EXTRA_TOKENS_406>|",
2847
+ "lstrip": false,
2848
+ "normalized": false,
2849
+ "rstrip": false,
2850
+ "single_word": false
2851
+ },
2852
+ {
2853
+ "content": "|<EXTRA_TOKENS_407>|",
2854
+ "lstrip": false,
2855
+ "normalized": false,
2856
+ "rstrip": false,
2857
+ "single_word": false
2858
+ },
2859
+ {
2860
+ "content": "|<EXTRA_TOKENS_408>|",
2861
+ "lstrip": false,
2862
+ "normalized": false,
2863
+ "rstrip": false,
2864
+ "single_word": false
2865
+ },
2866
+ {
2867
+ "content": "|<EXTRA_TOKENS_409>|",
2868
+ "lstrip": false,
2869
+ "normalized": false,
2870
+ "rstrip": false,
2871
+ "single_word": false
2872
+ },
2873
+ {
2874
+ "content": "|<EXTRA_TOKENS_410>|",
2875
+ "lstrip": false,
2876
+ "normalized": false,
2877
+ "rstrip": false,
2878
+ "single_word": false
2879
+ },
2880
+ {
2881
+ "content": "|<EXTRA_TOKENS_411>|",
2882
+ "lstrip": false,
2883
+ "normalized": false,
2884
+ "rstrip": false,
2885
+ "single_word": false
2886
+ },
2887
+ {
2888
+ "content": "|<EXTRA_TOKENS_412>|",
2889
+ "lstrip": false,
2890
+ "normalized": false,
2891
+ "rstrip": false,
2892
+ "single_word": false
2893
+ },
2894
+ {
2895
+ "content": "|<EXTRA_TOKENS_413>|",
2896
+ "lstrip": false,
2897
+ "normalized": false,
2898
+ "rstrip": false,
2899
+ "single_word": false
2900
+ },
2901
+ {
2902
+ "content": "|<EXTRA_TOKENS_414>|",
2903
+ "lstrip": false,
2904
+ "normalized": false,
2905
+ "rstrip": false,
2906
+ "single_word": false
2907
+ },
2908
+ {
2909
+ "content": "|<EXTRA_TOKENS_415>|",
2910
+ "lstrip": false,
2911
+ "normalized": false,
2912
+ "rstrip": false,
2913
+ "single_word": false
2914
+ },
2915
+ {
2916
+ "content": "|<EXTRA_TOKENS_416>|",
2917
+ "lstrip": false,
2918
+ "normalized": false,
2919
+ "rstrip": false,
2920
+ "single_word": false
2921
+ },
2922
+ {
2923
+ "content": "|<EXTRA_TOKENS_417>|",
2924
+ "lstrip": false,
2925
+ "normalized": false,
2926
+ "rstrip": false,
2927
+ "single_word": false
2928
+ },
2929
+ {
2930
+ "content": "|<EXTRA_TOKENS_418>|",
2931
+ "lstrip": false,
2932
+ "normalized": false,
2933
+ "rstrip": false,
2934
+ "single_word": false
2935
+ },
2936
+ {
2937
+ "content": "|<EXTRA_TOKENS_419>|",
2938
+ "lstrip": false,
2939
+ "normalized": false,
2940
+ "rstrip": false,
2941
+ "single_word": false
2942
+ },
2943
+ {
2944
+ "content": "|<EXTRA_TOKENS_420>|",
2945
+ "lstrip": false,
2946
+ "normalized": false,
2947
+ "rstrip": false,
2948
+ "single_word": false
2949
+ },
2950
+ {
2951
+ "content": "|<EXTRA_TOKENS_421>|",
2952
+ "lstrip": false,
2953
+ "normalized": false,
2954
+ "rstrip": false,
2955
+ "single_word": false
2956
+ },
2957
+ {
2958
+ "content": "|<EXTRA_TOKENS_422>|",
2959
+ "lstrip": false,
2960
+ "normalized": false,
2961
+ "rstrip": false,
2962
+ "single_word": false
2963
+ },
2964
+ {
2965
+ "content": "|<EXTRA_TOKENS_423>|",
2966
+ "lstrip": false,
2967
+ "normalized": false,
2968
+ "rstrip": false,
2969
+ "single_word": false
2970
+ },
2971
+ {
2972
+ "content": "|<EXTRA_TOKENS_424>|",
2973
+ "lstrip": false,
2974
+ "normalized": false,
2975
+ "rstrip": false,
2976
+ "single_word": false
2977
+ },
2978
+ {
2979
+ "content": "|<EXTRA_TOKENS_425>|",
2980
+ "lstrip": false,
2981
+ "normalized": false,
2982
+ "rstrip": false,
2983
+ "single_word": false
2984
+ },
2985
+ {
2986
+ "content": "|<EXTRA_TOKENS_426>|",
2987
+ "lstrip": false,
2988
+ "normalized": false,
2989
+ "rstrip": false,
2990
+ "single_word": false
2991
+ },
2992
+ {
2993
+ "content": "|<EXTRA_TOKENS_427>|",
2994
+ "lstrip": false,
2995
+ "normalized": false,
2996
+ "rstrip": false,
2997
+ "single_word": false
2998
+ },
2999
+ {
3000
+ "content": "|<EXTRA_TOKENS_428>|",
3001
+ "lstrip": false,
3002
+ "normalized": false,
3003
+ "rstrip": false,
3004
+ "single_word": false
3005
+ },
3006
+ {
3007
+ "content": "|<EXTRA_TOKENS_429>|",
3008
+ "lstrip": false,
3009
+ "normalized": false,
3010
+ "rstrip": false,
3011
+ "single_word": false
3012
+ },
3013
+ {
3014
+ "content": "|<EXTRA_TOKENS_430>|",
3015
+ "lstrip": false,
3016
+ "normalized": false,
3017
+ "rstrip": false,
3018
+ "single_word": false
3019
+ },
3020
+ {
3021
+ "content": "|<EXTRA_TOKENS_431>|",
3022
+ "lstrip": false,
3023
+ "normalized": false,
3024
+ "rstrip": false,
3025
+ "single_word": false
3026
+ },
3027
+ {
3028
+ "content": "|<EXTRA_TOKENS_432>|",
3029
+ "lstrip": false,
3030
+ "normalized": false,
3031
+ "rstrip": false,
3032
+ "single_word": false
3033
+ },
3034
+ {
3035
+ "content": "|<EXTRA_TOKENS_433>|",
3036
+ "lstrip": false,
3037
+ "normalized": false,
3038
+ "rstrip": false,
3039
+ "single_word": false
3040
+ },
3041
+ {
3042
+ "content": "|<EXTRA_TOKENS_434>|",
3043
+ "lstrip": false,
3044
+ "normalized": false,
3045
+ "rstrip": false,
3046
+ "single_word": false
3047
+ },
3048
+ {
3049
+ "content": "|<EXTRA_TOKENS_435>|",
3050
+ "lstrip": false,
3051
+ "normalized": false,
3052
+ "rstrip": false,
3053
+ "single_word": false
3054
+ },
3055
+ {
3056
+ "content": "|<EXTRA_TOKENS_436>|",
3057
+ "lstrip": false,
3058
+ "normalized": false,
3059
+ "rstrip": false,
3060
+ "single_word": false
3061
+ },
3062
+ {
3063
+ "content": "|<EXTRA_TOKENS_437>|",
3064
+ "lstrip": false,
3065
+ "normalized": false,
3066
+ "rstrip": false,
3067
+ "single_word": false
3068
+ },
3069
+ {
3070
+ "content": "|<EXTRA_TOKENS_438>|",
3071
+ "lstrip": false,
3072
+ "normalized": false,
3073
+ "rstrip": false,
3074
+ "single_word": false
3075
+ },
3076
+ {
3077
+ "content": "|<EXTRA_TOKENS_439>|",
3078
+ "lstrip": false,
3079
+ "normalized": false,
3080
+ "rstrip": false,
3081
+ "single_word": false
3082
+ },
3083
+ {
3084
+ "content": "|<EXTRA_TOKENS_440>|",
3085
+ "lstrip": false,
3086
+ "normalized": false,
3087
+ "rstrip": false,
3088
+ "single_word": false
3089
+ },
3090
+ {
3091
+ "content": "|<EXTRA_TOKENS_441>|",
3092
+ "lstrip": false,
3093
+ "normalized": false,
3094
+ "rstrip": false,
3095
+ "single_word": false
3096
+ },
3097
+ {
3098
+ "content": "|<EXTRA_TOKENS_442>|",
3099
+ "lstrip": false,
3100
+ "normalized": false,
3101
+ "rstrip": false,
3102
+ "single_word": false
3103
+ },
3104
+ {
3105
+ "content": "|<EXTRA_TOKENS_443>|",
3106
+ "lstrip": false,
3107
+ "normalized": false,
3108
+ "rstrip": false,
3109
+ "single_word": false
3110
+ },
3111
+ {
3112
+ "content": "|<EXTRA_TOKENS_444>|",
3113
+ "lstrip": false,
3114
+ "normalized": false,
3115
+ "rstrip": false,
3116
+ "single_word": false
3117
+ },
3118
+ {
3119
+ "content": "|<EXTRA_TOKENS_445>|",
3120
+ "lstrip": false,
3121
+ "normalized": false,
3122
+ "rstrip": false,
3123
+ "single_word": false
3124
+ },
3125
+ {
3126
+ "content": "|<EXTRA_TOKENS_446>|",
3127
+ "lstrip": false,
3128
+ "normalized": false,
3129
+ "rstrip": false,
3130
+ "single_word": false
3131
+ },
3132
+ {
3133
+ "content": "|<EXTRA_TOKENS_447>|",
3134
+ "lstrip": false,
3135
+ "normalized": false,
3136
+ "rstrip": false,
3137
+ "single_word": false
3138
+ },
3139
+ {
3140
+ "content": "|<EXTRA_TOKENS_448>|",
3141
+ "lstrip": false,
3142
+ "normalized": false,
3143
+ "rstrip": false,
3144
+ "single_word": false
3145
+ },
3146
+ {
3147
+ "content": "|<EXTRA_TOKENS_449>|",
3148
+ "lstrip": false,
3149
+ "normalized": false,
3150
+ "rstrip": false,
3151
+ "single_word": false
3152
+ },
3153
+ {
3154
+ "content": "|<EXTRA_TOKENS_450>|",
3155
+ "lstrip": false,
3156
+ "normalized": false,
3157
+ "rstrip": false,
3158
+ "single_word": false
3159
+ },
3160
+ {
3161
+ "content": "|<EXTRA_TOKENS_451>|",
3162
+ "lstrip": false,
3163
+ "normalized": false,
3164
+ "rstrip": false,
3165
+ "single_word": false
3166
+ },
3167
+ {
3168
+ "content": "|<EXTRA_TOKENS_452>|",
3169
+ "lstrip": false,
3170
+ "normalized": false,
3171
+ "rstrip": false,
3172
+ "single_word": false
3173
+ },
3174
+ {
3175
+ "content": "|<EXTRA_TOKENS_453>|",
3176
+ "lstrip": false,
3177
+ "normalized": false,
3178
+ "rstrip": false,
3179
+ "single_word": false
3180
+ },
3181
+ {
3182
+ "content": "|<EXTRA_TOKENS_454>|",
3183
+ "lstrip": false,
3184
+ "normalized": false,
3185
+ "rstrip": false,
3186
+ "single_word": false
3187
+ },
3188
+ {
3189
+ "content": "|<EXTRA_TOKENS_455>|",
3190
+ "lstrip": false,
3191
+ "normalized": false,
3192
+ "rstrip": false,
3193
+ "single_word": false
3194
+ },
3195
+ {
3196
+ "content": "<im_start>",
3197
+ "lstrip": false,
3198
+ "normalized": false,
3199
+ "rstrip": false,
3200
+ "single_word": false
3201
+ },
3202
+ {
3203
+ "content": "<im_end>",
3204
+ "lstrip": false,
3205
+ "normalized": false,
3206
+ "rstrip": false,
3207
+ "single_word": false
3208
+ },
3209
+ {
3210
+ "content": "<im_patch>",
3211
+ "lstrip": false,
3212
+ "normalized": false,
3213
+ "rstrip": false,
3214
+ "single_word": false
3215
+ },
3216
+ {
3217
+ "content": "<im_col>",
3218
+ "lstrip": false,
3219
+ "normalized": false,
3220
+ "rstrip": false,
3221
+ "single_word": false
3222
+ },
3223
+ {
3224
+ "content": "<|image|>",
3225
+ "lstrip": false,
3226
+ "normalized": false,
3227
+ "rstrip": false,
3228
+ "single_word": false
3229
+ },
3230
+ {
3231
+ "content": "<im_low>",
3232
+ "lstrip": false,
3233
+ "normalized": false,
3234
+ "rstrip": false,
3235
+ "single_word": false
3236
+ }
3237
+ ],
3238
+ "bos_token": {
3239
+ "content": "<|endoftext|>",
3240
+ "lstrip": false,
3241
+ "normalized": false,
3242
+ "rstrip": false,
3243
+ "single_word": false
3244
+ },
3245
+ "eos_token": {
3246
+ "content": "<|endoftext|>",
3247
+ "lstrip": false,
3248
+ "normalized": false,
3249
+ "rstrip": false,
3250
+ "single_word": false
3251
+ },
3252
+ "pad_token": {
3253
+ "content": "<|pad|>",
3254
+ "lstrip": false,
3255
+ "normalized": false,
3256
+ "rstrip": false,
3257
+ "single_word": false
3258
+ },
3259
+ "unk_token": {
3260
+ "content": "<|endoftext|>",
3261
+ "lstrip": false,
3262
+ "normalized": false,
3263
+ "rstrip": false,
3264
+ "single_word": false
3265
+ }
3266
+ }
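
The map above registers the `|<EXTRA_TOKENS_*>|` placeholders and the image markers (`<im_start>`, `<im_end>`, `<im_patch>`, `<im_col>`, `<|image|>`, `<im_low>`) as additional special tokens, and points bos/eos/unk at `<|endoftext|>` with `<|pad|>` as the pad token. A minimal sketch of how this surfaces at load time, assuming the repository loads with `transformers.AutoTokenizer` (the repo id below is a placeholder, not the actual repo):

```python
# Minimal sketch: inspect the special tokens declared in special_tokens_map.json.
# "your-org/your-repo" is a placeholder repo id.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("your-org/your-repo")

# bos/eos/unk should resolve to <|endoftext|>, pad to <|pad|>, per the JSON above.
print(tok.special_tokens_map)

# Registered special tokens map to single ids and are never split by the tokenizer.
for t in ["<im_start>", "<im_patch>", "<|image|>", "|<EXTRA_TOKENS_421>|"]:
    print(t, tok.convert_tokens_to_ids(t))
```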
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
vocab.json ADDED
The diff for this file is too large to render. See raw diff