Eldar Kurtic committed on
Commit
d7a8773
·
unverified ·
1 Parent(s): 1dbd70e

add new ckpt with compressed shared experts as well

Browse files
Files changed (48) hide show
  1. README.md +0 -19
  2. config.json +0 -144
  3. config.json.original +0 -817
  4. model-00001-of-00025.safetensors → model-00001-of-00023.safetensors +0 -0
  5. model-00004-of-00025.safetensors → model-00002-of-00023.safetensors +2 -2
  6. model-00005-of-00025.safetensors → model-00003-of-00023.safetensors +2 -2
  7. model-00002-of-00025.safetensors → model-00004-of-00023.safetensors +2 -2
  8. model-00003-of-00025.safetensors → model-00005-of-00023.safetensors +2 -2
  9. model-00006-of-00023.safetensors +3 -0
  10. model-00006-of-00025.safetensors +0 -3
  11. model-00007-of-00023.safetensors +3 -0
  12. model-00007-of-00025.safetensors +0 -3
  13. model-00008-of-00023.safetensors +3 -0
  14. model-00008-of-00025.safetensors +0 -3
  15. model-00009-of-00023.safetensors +3 -0
  16. model-00009-of-00025.safetensors +0 -3
  17. model-00010-of-00023.safetensors +3 -0
  18. model-00010-of-00025.safetensors +0 -3
  19. model-00011-of-00023.safetensors +3 -0
  20. model-00011-of-00025.safetensors +0 -3
  21. model-00012-of-00023.safetensors +3 -0
  22. model-00012-of-00025.safetensors +0 -3
  23. model-00013-of-00023.safetensors +3 -0
  24. model-00013-of-00025.safetensors +0 -3
  25. model-00014-of-00023.safetensors +3 -0
  26. model-00014-of-00025.safetensors +0 -3
  27. model-00015-of-00023.safetensors +3 -0
  28. model-00015-of-00025.safetensors +0 -3
  29. model-00016-of-00023.safetensors +3 -0
  30. model-00016-of-00025.safetensors +0 -3
  31. model-00017-of-00023.safetensors +3 -0
  32. model-00017-of-00025.safetensors +0 -3
  33. model-00018-of-00023.safetensors +3 -0
  34. model-00018-of-00025.safetensors +0 -3
  35. model-00019-of-00023.safetensors +3 -0
  36. model-00019-of-00025.safetensors +0 -3
  37. model-00020-of-00023.safetensors +3 -0
  38. model-00020-of-00025.safetensors +0 -3
  39. model-00021-of-00023.safetensors +3 -0
  40. model-00021-of-00025.safetensors +0 -3
  41. model-00022-of-00023.safetensors +3 -0
  42. model-00022-of-00025.safetensors +0 -3
  43. model-00023-of-00023.safetensors +3 -0
  44. model-00023-of-00025.safetensors +0 -3
  45. model-00024-of-00025.safetensors +0 -3
  46. model-00025-of-00025.safetensors +0 -3
  47. model.safetensors.index.json +0 -0
  48. recipe.yaml +1 -3
README.md DELETED
@@ -1,19 +0,0 @@
1
- ## More details and evals coming soon...
2
-
3
- ## Sanity check - GSM8k eval
4
-
5
- vllm (pretrained=/home/eldar/Llama-4-Scout-17B-16E-Instruct-ForQuant,tensor_parallel_size=4,max_model_len=4096,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: auto
6
-
7
- - `meta-llama/Llama-4-Scout-17B-16E-Instruct` unquantized baseline
8
-
9
- |Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr|
10
- |-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
11
- |gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.9189|± |0.0075|
12
- | | |strict-match | 5|exact_match|↑ |0.9014|± |0.0082|
13
-
14
- - `RedHatAI/Llama-4-Scout-17B-16E-Instruct-FP8-dynamic` FP8 quantized (this model)
15
-
16
- |Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr|
17
- |-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
18
- |gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.9219|± |0.0074|
19
- | | |strict-match | 5|exact_match|↑ |0.9075|± |0.0080|
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
config.json CHANGED
@@ -326,385 +326,241 @@
326
  "language_model.model.layers.0.self_attn.v_proj",
327
  "language_model.model.layers.0.self_attn.o_proj",
328
  "language_model.model.layers.0.feed_forward.router",
329
- "language_model.model.layers.0.feed_forward.shared_expert.gate_proj",
330
- "language_model.model.layers.0.feed_forward.shared_expert.up_proj",
331
- "language_model.model.layers.0.feed_forward.shared_expert.down_proj",
332
  "language_model.model.layers.1.self_attn.q_proj",
333
  "language_model.model.layers.1.self_attn.k_proj",
334
  "language_model.model.layers.1.self_attn.v_proj",
335
  "language_model.model.layers.1.self_attn.o_proj",
336
  "language_model.model.layers.1.feed_forward.router",
337
- "language_model.model.layers.1.feed_forward.shared_expert.gate_proj",
338
- "language_model.model.layers.1.feed_forward.shared_expert.up_proj",
339
- "language_model.model.layers.1.feed_forward.shared_expert.down_proj",
340
  "language_model.model.layers.2.self_attn.q_proj",
341
  "language_model.model.layers.2.self_attn.k_proj",
342
  "language_model.model.layers.2.self_attn.v_proj",
343
  "language_model.model.layers.2.self_attn.o_proj",
344
  "language_model.model.layers.2.feed_forward.router",
345
- "language_model.model.layers.2.feed_forward.shared_expert.gate_proj",
346
- "language_model.model.layers.2.feed_forward.shared_expert.up_proj",
347
- "language_model.model.layers.2.feed_forward.shared_expert.down_proj",
348
  "language_model.model.layers.3.self_attn.q_proj",
349
  "language_model.model.layers.3.self_attn.k_proj",
350
  "language_model.model.layers.3.self_attn.v_proj",
351
  "language_model.model.layers.3.self_attn.o_proj",
352
  "language_model.model.layers.3.feed_forward.router",
353
- "language_model.model.layers.3.feed_forward.shared_expert.gate_proj",
354
- "language_model.model.layers.3.feed_forward.shared_expert.up_proj",
355
- "language_model.model.layers.3.feed_forward.shared_expert.down_proj",
356
  "language_model.model.layers.4.self_attn.q_proj",
357
  "language_model.model.layers.4.self_attn.k_proj",
358
  "language_model.model.layers.4.self_attn.v_proj",
359
  "language_model.model.layers.4.self_attn.o_proj",
360
  "language_model.model.layers.4.feed_forward.router",
361
- "language_model.model.layers.4.feed_forward.shared_expert.gate_proj",
362
- "language_model.model.layers.4.feed_forward.shared_expert.up_proj",
363
- "language_model.model.layers.4.feed_forward.shared_expert.down_proj",
364
  "language_model.model.layers.5.self_attn.q_proj",
365
  "language_model.model.layers.5.self_attn.k_proj",
366
  "language_model.model.layers.5.self_attn.v_proj",
367
  "language_model.model.layers.5.self_attn.o_proj",
368
  "language_model.model.layers.5.feed_forward.router",
369
- "language_model.model.layers.5.feed_forward.shared_expert.gate_proj",
370
- "language_model.model.layers.5.feed_forward.shared_expert.up_proj",
371
- "language_model.model.layers.5.feed_forward.shared_expert.down_proj",
372
  "language_model.model.layers.6.self_attn.q_proj",
373
  "language_model.model.layers.6.self_attn.k_proj",
374
  "language_model.model.layers.6.self_attn.v_proj",
375
  "language_model.model.layers.6.self_attn.o_proj",
376
  "language_model.model.layers.6.feed_forward.router",
377
- "language_model.model.layers.6.feed_forward.shared_expert.gate_proj",
378
- "language_model.model.layers.6.feed_forward.shared_expert.up_proj",
379
- "language_model.model.layers.6.feed_forward.shared_expert.down_proj",
380
  "language_model.model.layers.7.self_attn.q_proj",
381
  "language_model.model.layers.7.self_attn.k_proj",
382
  "language_model.model.layers.7.self_attn.v_proj",
383
  "language_model.model.layers.7.self_attn.o_proj",
384
  "language_model.model.layers.7.feed_forward.router",
385
- "language_model.model.layers.7.feed_forward.shared_expert.gate_proj",
386
- "language_model.model.layers.7.feed_forward.shared_expert.up_proj",
387
- "language_model.model.layers.7.feed_forward.shared_expert.down_proj",
388
  "language_model.model.layers.8.self_attn.q_proj",
389
  "language_model.model.layers.8.self_attn.k_proj",
390
  "language_model.model.layers.8.self_attn.v_proj",
391
  "language_model.model.layers.8.self_attn.o_proj",
392
  "language_model.model.layers.8.feed_forward.router",
393
- "language_model.model.layers.8.feed_forward.shared_expert.gate_proj",
394
- "language_model.model.layers.8.feed_forward.shared_expert.up_proj",
395
- "language_model.model.layers.8.feed_forward.shared_expert.down_proj",
396
  "language_model.model.layers.9.self_attn.q_proj",
397
  "language_model.model.layers.9.self_attn.k_proj",
398
  "language_model.model.layers.9.self_attn.v_proj",
399
  "language_model.model.layers.9.self_attn.o_proj",
400
  "language_model.model.layers.9.feed_forward.router",
401
- "language_model.model.layers.9.feed_forward.shared_expert.gate_proj",
402
- "language_model.model.layers.9.feed_forward.shared_expert.up_proj",
403
- "language_model.model.layers.9.feed_forward.shared_expert.down_proj",
404
  "language_model.model.layers.10.self_attn.q_proj",
405
  "language_model.model.layers.10.self_attn.k_proj",
406
  "language_model.model.layers.10.self_attn.v_proj",
407
  "language_model.model.layers.10.self_attn.o_proj",
408
  "language_model.model.layers.10.feed_forward.router",
409
- "language_model.model.layers.10.feed_forward.shared_expert.gate_proj",
410
- "language_model.model.layers.10.feed_forward.shared_expert.up_proj",
411
- "language_model.model.layers.10.feed_forward.shared_expert.down_proj",
412
  "language_model.model.layers.11.self_attn.q_proj",
413
  "language_model.model.layers.11.self_attn.k_proj",
414
  "language_model.model.layers.11.self_attn.v_proj",
415
  "language_model.model.layers.11.self_attn.o_proj",
416
  "language_model.model.layers.11.feed_forward.router",
417
- "language_model.model.layers.11.feed_forward.shared_expert.gate_proj",
418
- "language_model.model.layers.11.feed_forward.shared_expert.up_proj",
419
- "language_model.model.layers.11.feed_forward.shared_expert.down_proj",
420
  "language_model.model.layers.12.self_attn.q_proj",
421
  "language_model.model.layers.12.self_attn.k_proj",
422
  "language_model.model.layers.12.self_attn.v_proj",
423
  "language_model.model.layers.12.self_attn.o_proj",
424
  "language_model.model.layers.12.feed_forward.router",
425
- "language_model.model.layers.12.feed_forward.shared_expert.gate_proj",
426
- "language_model.model.layers.12.feed_forward.shared_expert.up_proj",
427
- "language_model.model.layers.12.feed_forward.shared_expert.down_proj",
428
  "language_model.model.layers.13.self_attn.q_proj",
429
  "language_model.model.layers.13.self_attn.k_proj",
430
  "language_model.model.layers.13.self_attn.v_proj",
431
  "language_model.model.layers.13.self_attn.o_proj",
432
  "language_model.model.layers.13.feed_forward.router",
433
- "language_model.model.layers.13.feed_forward.shared_expert.gate_proj",
434
- "language_model.model.layers.13.feed_forward.shared_expert.up_proj",
435
- "language_model.model.layers.13.feed_forward.shared_expert.down_proj",
436
  "language_model.model.layers.14.self_attn.q_proj",
437
  "language_model.model.layers.14.self_attn.k_proj",
438
  "language_model.model.layers.14.self_attn.v_proj",
439
  "language_model.model.layers.14.self_attn.o_proj",
440
  "language_model.model.layers.14.feed_forward.router",
441
- "language_model.model.layers.14.feed_forward.shared_expert.gate_proj",
442
- "language_model.model.layers.14.feed_forward.shared_expert.up_proj",
443
- "language_model.model.layers.14.feed_forward.shared_expert.down_proj",
444
  "language_model.model.layers.15.self_attn.q_proj",
445
  "language_model.model.layers.15.self_attn.k_proj",
446
  "language_model.model.layers.15.self_attn.v_proj",
447
  "language_model.model.layers.15.self_attn.o_proj",
448
  "language_model.model.layers.15.feed_forward.router",
449
- "language_model.model.layers.15.feed_forward.shared_expert.gate_proj",
450
- "language_model.model.layers.15.feed_forward.shared_expert.up_proj",
451
- "language_model.model.layers.15.feed_forward.shared_expert.down_proj",
452
  "language_model.model.layers.16.self_attn.q_proj",
453
  "language_model.model.layers.16.self_attn.k_proj",
454
  "language_model.model.layers.16.self_attn.v_proj",
455
  "language_model.model.layers.16.self_attn.o_proj",
456
  "language_model.model.layers.16.feed_forward.router",
457
- "language_model.model.layers.16.feed_forward.shared_expert.gate_proj",
458
- "language_model.model.layers.16.feed_forward.shared_expert.up_proj",
459
- "language_model.model.layers.16.feed_forward.shared_expert.down_proj",
460
  "language_model.model.layers.17.self_attn.q_proj",
461
  "language_model.model.layers.17.self_attn.k_proj",
462
  "language_model.model.layers.17.self_attn.v_proj",
463
  "language_model.model.layers.17.self_attn.o_proj",
464
  "language_model.model.layers.17.feed_forward.router",
465
- "language_model.model.layers.17.feed_forward.shared_expert.gate_proj",
466
- "language_model.model.layers.17.feed_forward.shared_expert.up_proj",
467
- "language_model.model.layers.17.feed_forward.shared_expert.down_proj",
468
  "language_model.model.layers.18.self_attn.q_proj",
469
  "language_model.model.layers.18.self_attn.k_proj",
470
  "language_model.model.layers.18.self_attn.v_proj",
471
  "language_model.model.layers.18.self_attn.o_proj",
472
  "language_model.model.layers.18.feed_forward.router",
473
- "language_model.model.layers.18.feed_forward.shared_expert.gate_proj",
474
- "language_model.model.layers.18.feed_forward.shared_expert.up_proj",
475
- "language_model.model.layers.18.feed_forward.shared_expert.down_proj",
476
  "language_model.model.layers.19.self_attn.q_proj",
477
  "language_model.model.layers.19.self_attn.k_proj",
478
  "language_model.model.layers.19.self_attn.v_proj",
479
  "language_model.model.layers.19.self_attn.o_proj",
480
  "language_model.model.layers.19.feed_forward.router",
481
- "language_model.model.layers.19.feed_forward.shared_expert.gate_proj",
482
- "language_model.model.layers.19.feed_forward.shared_expert.up_proj",
483
- "language_model.model.layers.19.feed_forward.shared_expert.down_proj",
484
  "language_model.model.layers.20.self_attn.q_proj",
485
  "language_model.model.layers.20.self_attn.k_proj",
486
  "language_model.model.layers.20.self_attn.v_proj",
487
  "language_model.model.layers.20.self_attn.o_proj",
488
  "language_model.model.layers.20.feed_forward.router",
489
- "language_model.model.layers.20.feed_forward.shared_expert.gate_proj",
490
- "language_model.model.layers.20.feed_forward.shared_expert.up_proj",
491
- "language_model.model.layers.20.feed_forward.shared_expert.down_proj",
492
  "language_model.model.layers.21.self_attn.q_proj",
493
  "language_model.model.layers.21.self_attn.k_proj",
494
  "language_model.model.layers.21.self_attn.v_proj",
495
  "language_model.model.layers.21.self_attn.o_proj",
496
  "language_model.model.layers.21.feed_forward.router",
497
- "language_model.model.layers.21.feed_forward.shared_expert.gate_proj",
498
- "language_model.model.layers.21.feed_forward.shared_expert.up_proj",
499
- "language_model.model.layers.21.feed_forward.shared_expert.down_proj",
500
  "language_model.model.layers.22.self_attn.q_proj",
501
  "language_model.model.layers.22.self_attn.k_proj",
502
  "language_model.model.layers.22.self_attn.v_proj",
503
  "language_model.model.layers.22.self_attn.o_proj",
504
  "language_model.model.layers.22.feed_forward.router",
505
- "language_model.model.layers.22.feed_forward.shared_expert.gate_proj",
506
- "language_model.model.layers.22.feed_forward.shared_expert.up_proj",
507
- "language_model.model.layers.22.feed_forward.shared_expert.down_proj",
508
  "language_model.model.layers.23.self_attn.q_proj",
509
  "language_model.model.layers.23.self_attn.k_proj",
510
  "language_model.model.layers.23.self_attn.v_proj",
511
  "language_model.model.layers.23.self_attn.o_proj",
512
  "language_model.model.layers.23.feed_forward.router",
513
- "language_model.model.layers.23.feed_forward.shared_expert.gate_proj",
514
- "language_model.model.layers.23.feed_forward.shared_expert.up_proj",
515
- "language_model.model.layers.23.feed_forward.shared_expert.down_proj",
516
  "language_model.model.layers.24.self_attn.q_proj",
517
  "language_model.model.layers.24.self_attn.k_proj",
518
  "language_model.model.layers.24.self_attn.v_proj",
519
  "language_model.model.layers.24.self_attn.o_proj",
520
  "language_model.model.layers.24.feed_forward.router",
521
- "language_model.model.layers.24.feed_forward.shared_expert.gate_proj",
522
- "language_model.model.layers.24.feed_forward.shared_expert.up_proj",
523
- "language_model.model.layers.24.feed_forward.shared_expert.down_proj",
524
  "language_model.model.layers.25.self_attn.q_proj",
525
  "language_model.model.layers.25.self_attn.k_proj",
526
  "language_model.model.layers.25.self_attn.v_proj",
527
  "language_model.model.layers.25.self_attn.o_proj",
528
  "language_model.model.layers.25.feed_forward.router",
529
- "language_model.model.layers.25.feed_forward.shared_expert.gate_proj",
530
- "language_model.model.layers.25.feed_forward.shared_expert.up_proj",
531
- "language_model.model.layers.25.feed_forward.shared_expert.down_proj",
532
  "language_model.model.layers.26.self_attn.q_proj",
533
  "language_model.model.layers.26.self_attn.k_proj",
534
  "language_model.model.layers.26.self_attn.v_proj",
535
  "language_model.model.layers.26.self_attn.o_proj",
536
  "language_model.model.layers.26.feed_forward.router",
537
- "language_model.model.layers.26.feed_forward.shared_expert.gate_proj",
538
- "language_model.model.layers.26.feed_forward.shared_expert.up_proj",
539
- "language_model.model.layers.26.feed_forward.shared_expert.down_proj",
540
  "language_model.model.layers.27.self_attn.q_proj",
541
  "language_model.model.layers.27.self_attn.k_proj",
542
  "language_model.model.layers.27.self_attn.v_proj",
543
  "language_model.model.layers.27.self_attn.o_proj",
544
  "language_model.model.layers.27.feed_forward.router",
545
- "language_model.model.layers.27.feed_forward.shared_expert.gate_proj",
546
- "language_model.model.layers.27.feed_forward.shared_expert.up_proj",
547
- "language_model.model.layers.27.feed_forward.shared_expert.down_proj",
548
  "language_model.model.layers.28.self_attn.q_proj",
549
  "language_model.model.layers.28.self_attn.k_proj",
550
  "language_model.model.layers.28.self_attn.v_proj",
551
  "language_model.model.layers.28.self_attn.o_proj",
552
  "language_model.model.layers.28.feed_forward.router",
553
- "language_model.model.layers.28.feed_forward.shared_expert.gate_proj",
554
- "language_model.model.layers.28.feed_forward.shared_expert.up_proj",
555
- "language_model.model.layers.28.feed_forward.shared_expert.down_proj",
556
  "language_model.model.layers.29.self_attn.q_proj",
557
  "language_model.model.layers.29.self_attn.k_proj",
558
  "language_model.model.layers.29.self_attn.v_proj",
559
  "language_model.model.layers.29.self_attn.o_proj",
560
  "language_model.model.layers.29.feed_forward.router",
561
- "language_model.model.layers.29.feed_forward.shared_expert.gate_proj",
562
- "language_model.model.layers.29.feed_forward.shared_expert.up_proj",
563
- "language_model.model.layers.29.feed_forward.shared_expert.down_proj",
564
  "language_model.model.layers.30.self_attn.q_proj",
565
  "language_model.model.layers.30.self_attn.k_proj",
566
  "language_model.model.layers.30.self_attn.v_proj",
567
  "language_model.model.layers.30.self_attn.o_proj",
568
  "language_model.model.layers.30.feed_forward.router",
569
- "language_model.model.layers.30.feed_forward.shared_expert.gate_proj",
570
- "language_model.model.layers.30.feed_forward.shared_expert.up_proj",
571
- "language_model.model.layers.30.feed_forward.shared_expert.down_proj",
572
  "language_model.model.layers.31.self_attn.q_proj",
573
  "language_model.model.layers.31.self_attn.k_proj",
574
  "language_model.model.layers.31.self_attn.v_proj",
575
  "language_model.model.layers.31.self_attn.o_proj",
576
  "language_model.model.layers.31.feed_forward.router",
577
- "language_model.model.layers.31.feed_forward.shared_expert.gate_proj",
578
- "language_model.model.layers.31.feed_forward.shared_expert.up_proj",
579
- "language_model.model.layers.31.feed_forward.shared_expert.down_proj",
580
  "language_model.model.layers.32.self_attn.q_proj",
581
  "language_model.model.layers.32.self_attn.k_proj",
582
  "language_model.model.layers.32.self_attn.v_proj",
583
  "language_model.model.layers.32.self_attn.o_proj",
584
  "language_model.model.layers.32.feed_forward.router",
585
- "language_model.model.layers.32.feed_forward.shared_expert.gate_proj",
586
- "language_model.model.layers.32.feed_forward.shared_expert.up_proj",
587
- "language_model.model.layers.32.feed_forward.shared_expert.down_proj",
588
  "language_model.model.layers.33.self_attn.q_proj",
589
  "language_model.model.layers.33.self_attn.k_proj",
590
  "language_model.model.layers.33.self_attn.v_proj",
591
  "language_model.model.layers.33.self_attn.o_proj",
592
  "language_model.model.layers.33.feed_forward.router",
593
- "language_model.model.layers.33.feed_forward.shared_expert.gate_proj",
594
- "language_model.model.layers.33.feed_forward.shared_expert.up_proj",
595
- "language_model.model.layers.33.feed_forward.shared_expert.down_proj",
596
  "language_model.model.layers.34.self_attn.q_proj",
597
  "language_model.model.layers.34.self_attn.k_proj",
598
  "language_model.model.layers.34.self_attn.v_proj",
599
  "language_model.model.layers.34.self_attn.o_proj",
600
  "language_model.model.layers.34.feed_forward.router",
601
- "language_model.model.layers.34.feed_forward.shared_expert.gate_proj",
602
- "language_model.model.layers.34.feed_forward.shared_expert.up_proj",
603
- "language_model.model.layers.34.feed_forward.shared_expert.down_proj",
604
  "language_model.model.layers.35.self_attn.q_proj",
605
  "language_model.model.layers.35.self_attn.k_proj",
606
  "language_model.model.layers.35.self_attn.v_proj",
607
  "language_model.model.layers.35.self_attn.o_proj",
608
  "language_model.model.layers.35.feed_forward.router",
609
- "language_model.model.layers.35.feed_forward.shared_expert.gate_proj",
610
- "language_model.model.layers.35.feed_forward.shared_expert.up_proj",
611
- "language_model.model.layers.35.feed_forward.shared_expert.down_proj",
612
  "language_model.model.layers.36.self_attn.q_proj",
613
  "language_model.model.layers.36.self_attn.k_proj",
614
  "language_model.model.layers.36.self_attn.v_proj",
615
  "language_model.model.layers.36.self_attn.o_proj",
616
  "language_model.model.layers.36.feed_forward.router",
617
- "language_model.model.layers.36.feed_forward.shared_expert.gate_proj",
618
- "language_model.model.layers.36.feed_forward.shared_expert.up_proj",
619
- "language_model.model.layers.36.feed_forward.shared_expert.down_proj",
620
  "language_model.model.layers.37.self_attn.q_proj",
621
  "language_model.model.layers.37.self_attn.k_proj",
622
  "language_model.model.layers.37.self_attn.v_proj",
623
  "language_model.model.layers.37.self_attn.o_proj",
624
  "language_model.model.layers.37.feed_forward.router",
625
- "language_model.model.layers.37.feed_forward.shared_expert.gate_proj",
626
- "language_model.model.layers.37.feed_forward.shared_expert.up_proj",
627
- "language_model.model.layers.37.feed_forward.shared_expert.down_proj",
628
  "language_model.model.layers.38.self_attn.q_proj",
629
  "language_model.model.layers.38.self_attn.k_proj",
630
  "language_model.model.layers.38.self_attn.v_proj",
631
  "language_model.model.layers.38.self_attn.o_proj",
632
  "language_model.model.layers.38.feed_forward.router",
633
- "language_model.model.layers.38.feed_forward.shared_expert.gate_proj",
634
- "language_model.model.layers.38.feed_forward.shared_expert.up_proj",
635
- "language_model.model.layers.38.feed_forward.shared_expert.down_proj",
636
  "language_model.model.layers.39.self_attn.q_proj",
637
  "language_model.model.layers.39.self_attn.k_proj",
638
  "language_model.model.layers.39.self_attn.v_proj",
639
  "language_model.model.layers.39.self_attn.o_proj",
640
  "language_model.model.layers.39.feed_forward.router",
641
- "language_model.model.layers.39.feed_forward.shared_expert.gate_proj",
642
- "language_model.model.layers.39.feed_forward.shared_expert.up_proj",
643
- "language_model.model.layers.39.feed_forward.shared_expert.down_proj",
644
  "language_model.model.layers.40.self_attn.q_proj",
645
  "language_model.model.layers.40.self_attn.k_proj",
646
  "language_model.model.layers.40.self_attn.v_proj",
647
  "language_model.model.layers.40.self_attn.o_proj",
648
  "language_model.model.layers.40.feed_forward.router",
649
- "language_model.model.layers.40.feed_forward.shared_expert.gate_proj",
650
- "language_model.model.layers.40.feed_forward.shared_expert.up_proj",
651
- "language_model.model.layers.40.feed_forward.shared_expert.down_proj",
652
  "language_model.model.layers.41.self_attn.q_proj",
653
  "language_model.model.layers.41.self_attn.k_proj",
654
  "language_model.model.layers.41.self_attn.v_proj",
655
  "language_model.model.layers.41.self_attn.o_proj",
656
  "language_model.model.layers.41.feed_forward.router",
657
- "language_model.model.layers.41.feed_forward.shared_expert.gate_proj",
658
- "language_model.model.layers.41.feed_forward.shared_expert.up_proj",
659
- "language_model.model.layers.41.feed_forward.shared_expert.down_proj",
660
  "language_model.model.layers.42.self_attn.q_proj",
661
  "language_model.model.layers.42.self_attn.k_proj",
662
  "language_model.model.layers.42.self_attn.v_proj",
663
  "language_model.model.layers.42.self_attn.o_proj",
664
  "language_model.model.layers.42.feed_forward.router",
665
- "language_model.model.layers.42.feed_forward.shared_expert.gate_proj",
666
- "language_model.model.layers.42.feed_forward.shared_expert.up_proj",
667
- "language_model.model.layers.42.feed_forward.shared_expert.down_proj",
668
  "language_model.model.layers.43.self_attn.q_proj",
669
  "language_model.model.layers.43.self_attn.k_proj",
670
  "language_model.model.layers.43.self_attn.v_proj",
671
  "language_model.model.layers.43.self_attn.o_proj",
672
  "language_model.model.layers.43.feed_forward.router",
673
- "language_model.model.layers.43.feed_forward.shared_expert.gate_proj",
674
- "language_model.model.layers.43.feed_forward.shared_expert.up_proj",
675
- "language_model.model.layers.43.feed_forward.shared_expert.down_proj",
676
  "language_model.model.layers.44.self_attn.q_proj",
677
  "language_model.model.layers.44.self_attn.k_proj",
678
  "language_model.model.layers.44.self_attn.v_proj",
679
  "language_model.model.layers.44.self_attn.o_proj",
680
  "language_model.model.layers.44.feed_forward.router",
681
- "language_model.model.layers.44.feed_forward.shared_expert.gate_proj",
682
- "language_model.model.layers.44.feed_forward.shared_expert.up_proj",
683
- "language_model.model.layers.44.feed_forward.shared_expert.down_proj",
684
  "language_model.model.layers.45.self_attn.q_proj",
685
  "language_model.model.layers.45.self_attn.k_proj",
686
  "language_model.model.layers.45.self_attn.v_proj",
687
  "language_model.model.layers.45.self_attn.o_proj",
688
  "language_model.model.layers.45.feed_forward.router",
689
- "language_model.model.layers.45.feed_forward.shared_expert.gate_proj",
690
- "language_model.model.layers.45.feed_forward.shared_expert.up_proj",
691
- "language_model.model.layers.45.feed_forward.shared_expert.down_proj",
692
  "language_model.model.layers.46.self_attn.q_proj",
693
  "language_model.model.layers.46.self_attn.k_proj",
694
  "language_model.model.layers.46.self_attn.v_proj",
695
  "language_model.model.layers.46.self_attn.o_proj",
696
  "language_model.model.layers.46.feed_forward.router",
697
- "language_model.model.layers.46.feed_forward.shared_expert.gate_proj",
698
- "language_model.model.layers.46.feed_forward.shared_expert.up_proj",
699
- "language_model.model.layers.46.feed_forward.shared_expert.down_proj",
700
  "language_model.model.layers.47.self_attn.q_proj",
701
  "language_model.model.layers.47.self_attn.k_proj",
702
  "language_model.model.layers.47.self_attn.v_proj",
703
  "language_model.model.layers.47.self_attn.o_proj",
704
  "language_model.model.layers.47.feed_forward.router",
705
- "language_model.model.layers.47.feed_forward.shared_expert.gate_proj",
706
- "language_model.model.layers.47.feed_forward.shared_expert.up_proj",
707
- "language_model.model.layers.47.feed_forward.shared_expert.down_proj",
708
  "language_model.lm_head"
709
  ],
710
  "kv_cache_scheme": null,
 
326
  "language_model.model.layers.0.self_attn.v_proj",
327
  "language_model.model.layers.0.self_attn.o_proj",
328
  "language_model.model.layers.0.feed_forward.router",
 
 
 
329
  "language_model.model.layers.1.self_attn.q_proj",
330
  "language_model.model.layers.1.self_attn.k_proj",
331
  "language_model.model.layers.1.self_attn.v_proj",
332
  "language_model.model.layers.1.self_attn.o_proj",
333
  "language_model.model.layers.1.feed_forward.router",
 
 
 
334
  "language_model.model.layers.2.self_attn.q_proj",
335
  "language_model.model.layers.2.self_attn.k_proj",
336
  "language_model.model.layers.2.self_attn.v_proj",
337
  "language_model.model.layers.2.self_attn.o_proj",
338
  "language_model.model.layers.2.feed_forward.router",
 
 
 
339
  "language_model.model.layers.3.self_attn.q_proj",
340
  "language_model.model.layers.3.self_attn.k_proj",
341
  "language_model.model.layers.3.self_attn.v_proj",
342
  "language_model.model.layers.3.self_attn.o_proj",
343
  "language_model.model.layers.3.feed_forward.router",
 
 
 
344
  "language_model.model.layers.4.self_attn.q_proj",
345
  "language_model.model.layers.4.self_attn.k_proj",
346
  "language_model.model.layers.4.self_attn.v_proj",
347
  "language_model.model.layers.4.self_attn.o_proj",
348
  "language_model.model.layers.4.feed_forward.router",
 
 
 
349
  "language_model.model.layers.5.self_attn.q_proj",
350
  "language_model.model.layers.5.self_attn.k_proj",
351
  "language_model.model.layers.5.self_attn.v_proj",
352
  "language_model.model.layers.5.self_attn.o_proj",
353
  "language_model.model.layers.5.feed_forward.router",
 
 
 
354
  "language_model.model.layers.6.self_attn.q_proj",
355
  "language_model.model.layers.6.self_attn.k_proj",
356
  "language_model.model.layers.6.self_attn.v_proj",
357
  "language_model.model.layers.6.self_attn.o_proj",
358
  "language_model.model.layers.6.feed_forward.router",
 
 
 
359
  "language_model.model.layers.7.self_attn.q_proj",
360
  "language_model.model.layers.7.self_attn.k_proj",
361
  "language_model.model.layers.7.self_attn.v_proj",
362
  "language_model.model.layers.7.self_attn.o_proj",
363
  "language_model.model.layers.7.feed_forward.router",
 
 
 
364
  "language_model.model.layers.8.self_attn.q_proj",
365
  "language_model.model.layers.8.self_attn.k_proj",
366
  "language_model.model.layers.8.self_attn.v_proj",
367
  "language_model.model.layers.8.self_attn.o_proj",
368
  "language_model.model.layers.8.feed_forward.router",
 
 
 
369
  "language_model.model.layers.9.self_attn.q_proj",
370
  "language_model.model.layers.9.self_attn.k_proj",
371
  "language_model.model.layers.9.self_attn.v_proj",
372
  "language_model.model.layers.9.self_attn.o_proj",
373
  "language_model.model.layers.9.feed_forward.router",
 
 
 
374
  "language_model.model.layers.10.self_attn.q_proj",
375
  "language_model.model.layers.10.self_attn.k_proj",
376
  "language_model.model.layers.10.self_attn.v_proj",
377
  "language_model.model.layers.10.self_attn.o_proj",
378
  "language_model.model.layers.10.feed_forward.router",
 
 
 
379
  "language_model.model.layers.11.self_attn.q_proj",
380
  "language_model.model.layers.11.self_attn.k_proj",
381
  "language_model.model.layers.11.self_attn.v_proj",
382
  "language_model.model.layers.11.self_attn.o_proj",
383
  "language_model.model.layers.11.feed_forward.router",
 
 
 
384
  "language_model.model.layers.12.self_attn.q_proj",
385
  "language_model.model.layers.12.self_attn.k_proj",
386
  "language_model.model.layers.12.self_attn.v_proj",
387
  "language_model.model.layers.12.self_attn.o_proj",
388
  "language_model.model.layers.12.feed_forward.router",
 
 
 
389
  "language_model.model.layers.13.self_attn.q_proj",
390
  "language_model.model.layers.13.self_attn.k_proj",
391
  "language_model.model.layers.13.self_attn.v_proj",
392
  "language_model.model.layers.13.self_attn.o_proj",
393
  "language_model.model.layers.13.feed_forward.router",
 
 
 
394
  "language_model.model.layers.14.self_attn.q_proj",
395
  "language_model.model.layers.14.self_attn.k_proj",
396
  "language_model.model.layers.14.self_attn.v_proj",
397
  "language_model.model.layers.14.self_attn.o_proj",
398
  "language_model.model.layers.14.feed_forward.router",
 
 
 
399
  "language_model.model.layers.15.self_attn.q_proj",
400
  "language_model.model.layers.15.self_attn.k_proj",
401
  "language_model.model.layers.15.self_attn.v_proj",
402
  "language_model.model.layers.15.self_attn.o_proj",
403
  "language_model.model.layers.15.feed_forward.router",
 
 
 
404
  "language_model.model.layers.16.self_attn.q_proj",
405
  "language_model.model.layers.16.self_attn.k_proj",
406
  "language_model.model.layers.16.self_attn.v_proj",
407
  "language_model.model.layers.16.self_attn.o_proj",
408
  "language_model.model.layers.16.feed_forward.router",
 
 
 
409
  "language_model.model.layers.17.self_attn.q_proj",
410
  "language_model.model.layers.17.self_attn.k_proj",
411
  "language_model.model.layers.17.self_attn.v_proj",
412
  "language_model.model.layers.17.self_attn.o_proj",
413
  "language_model.model.layers.17.feed_forward.router",
 
 
 
414
  "language_model.model.layers.18.self_attn.q_proj",
415
  "language_model.model.layers.18.self_attn.k_proj",
416
  "language_model.model.layers.18.self_attn.v_proj",
417
  "language_model.model.layers.18.self_attn.o_proj",
418
  "language_model.model.layers.18.feed_forward.router",
 
 
 
419
  "language_model.model.layers.19.self_attn.q_proj",
420
  "language_model.model.layers.19.self_attn.k_proj",
421
  "language_model.model.layers.19.self_attn.v_proj",
422
  "language_model.model.layers.19.self_attn.o_proj",
423
  "language_model.model.layers.19.feed_forward.router",
 
 
 
424
  "language_model.model.layers.20.self_attn.q_proj",
425
  "language_model.model.layers.20.self_attn.k_proj",
426
  "language_model.model.layers.20.self_attn.v_proj",
427
  "language_model.model.layers.20.self_attn.o_proj",
428
  "language_model.model.layers.20.feed_forward.router",
 
 
 
429
  "language_model.model.layers.21.self_attn.q_proj",
430
  "language_model.model.layers.21.self_attn.k_proj",
431
  "language_model.model.layers.21.self_attn.v_proj",
432
  "language_model.model.layers.21.self_attn.o_proj",
433
  "language_model.model.layers.21.feed_forward.router",
 
 
 
434
  "language_model.model.layers.22.self_attn.q_proj",
435
  "language_model.model.layers.22.self_attn.k_proj",
436
  "language_model.model.layers.22.self_attn.v_proj",
437
  "language_model.model.layers.22.self_attn.o_proj",
438
  "language_model.model.layers.22.feed_forward.router",
 
 
 
439
  "language_model.model.layers.23.self_attn.q_proj",
440
  "language_model.model.layers.23.self_attn.k_proj",
441
  "language_model.model.layers.23.self_attn.v_proj",
442
  "language_model.model.layers.23.self_attn.o_proj",
443
  "language_model.model.layers.23.feed_forward.router",
 
 
 
444
  "language_model.model.layers.24.self_attn.q_proj",
445
  "language_model.model.layers.24.self_attn.k_proj",
446
  "language_model.model.layers.24.self_attn.v_proj",
447
  "language_model.model.layers.24.self_attn.o_proj",
448
  "language_model.model.layers.24.feed_forward.router",
 
 
 
449
  "language_model.model.layers.25.self_attn.q_proj",
450
  "language_model.model.layers.25.self_attn.k_proj",
451
  "language_model.model.layers.25.self_attn.v_proj",
452
  "language_model.model.layers.25.self_attn.o_proj",
453
  "language_model.model.layers.25.feed_forward.router",
 
 
 
454
  "language_model.model.layers.26.self_attn.q_proj",
455
  "language_model.model.layers.26.self_attn.k_proj",
456
  "language_model.model.layers.26.self_attn.v_proj",
457
  "language_model.model.layers.26.self_attn.o_proj",
458
  "language_model.model.layers.26.feed_forward.router",
 
 
 
459
  "language_model.model.layers.27.self_attn.q_proj",
460
  "language_model.model.layers.27.self_attn.k_proj",
461
  "language_model.model.layers.27.self_attn.v_proj",
462
  "language_model.model.layers.27.self_attn.o_proj",
463
  "language_model.model.layers.27.feed_forward.router",
 
 
 
464
  "language_model.model.layers.28.self_attn.q_proj",
465
  "language_model.model.layers.28.self_attn.k_proj",
466
  "language_model.model.layers.28.self_attn.v_proj",
467
  "language_model.model.layers.28.self_attn.o_proj",
468
  "language_model.model.layers.28.feed_forward.router",
 
 
 
469
  "language_model.model.layers.29.self_attn.q_proj",
470
  "language_model.model.layers.29.self_attn.k_proj",
471
  "language_model.model.layers.29.self_attn.v_proj",
472
  "language_model.model.layers.29.self_attn.o_proj",
473
  "language_model.model.layers.29.feed_forward.router",
 
 
 
474
  "language_model.model.layers.30.self_attn.q_proj",
475
  "language_model.model.layers.30.self_attn.k_proj",
476
  "language_model.model.layers.30.self_attn.v_proj",
477
  "language_model.model.layers.30.self_attn.o_proj",
478
  "language_model.model.layers.30.feed_forward.router",
 
 
 
479
  "language_model.model.layers.31.self_attn.q_proj",
480
  "language_model.model.layers.31.self_attn.k_proj",
481
  "language_model.model.layers.31.self_attn.v_proj",
482
  "language_model.model.layers.31.self_attn.o_proj",
483
  "language_model.model.layers.31.feed_forward.router",
 
 
 
484
  "language_model.model.layers.32.self_attn.q_proj",
485
  "language_model.model.layers.32.self_attn.k_proj",
486
  "language_model.model.layers.32.self_attn.v_proj",
487
  "language_model.model.layers.32.self_attn.o_proj",
488
  "language_model.model.layers.32.feed_forward.router",
 
 
 
489
  "language_model.model.layers.33.self_attn.q_proj",
490
  "language_model.model.layers.33.self_attn.k_proj",
491
  "language_model.model.layers.33.self_attn.v_proj",
492
  "language_model.model.layers.33.self_attn.o_proj",
493
  "language_model.model.layers.33.feed_forward.router",
 
 
 
494
  "language_model.model.layers.34.self_attn.q_proj",
495
  "language_model.model.layers.34.self_attn.k_proj",
496
  "language_model.model.layers.34.self_attn.v_proj",
497
  "language_model.model.layers.34.self_attn.o_proj",
498
  "language_model.model.layers.34.feed_forward.router",
 
 
 
499
  "language_model.model.layers.35.self_attn.q_proj",
500
  "language_model.model.layers.35.self_attn.k_proj",
501
  "language_model.model.layers.35.self_attn.v_proj",
502
  "language_model.model.layers.35.self_attn.o_proj",
503
  "language_model.model.layers.35.feed_forward.router",
 
 
 
504
  "language_model.model.layers.36.self_attn.q_proj",
505
  "language_model.model.layers.36.self_attn.k_proj",
506
  "language_model.model.layers.36.self_attn.v_proj",
507
  "language_model.model.layers.36.self_attn.o_proj",
508
  "language_model.model.layers.36.feed_forward.router",
 
 
 
509
  "language_model.model.layers.37.self_attn.q_proj",
510
  "language_model.model.layers.37.self_attn.k_proj",
511
  "language_model.model.layers.37.self_attn.v_proj",
512
  "language_model.model.layers.37.self_attn.o_proj",
513
  "language_model.model.layers.37.feed_forward.router",
 
 
 
514
  "language_model.model.layers.38.self_attn.q_proj",
515
  "language_model.model.layers.38.self_attn.k_proj",
516
  "language_model.model.layers.38.self_attn.v_proj",
517
  "language_model.model.layers.38.self_attn.o_proj",
518
  "language_model.model.layers.38.feed_forward.router",
 
 
 
519
  "language_model.model.layers.39.self_attn.q_proj",
520
  "language_model.model.layers.39.self_attn.k_proj",
521
  "language_model.model.layers.39.self_attn.v_proj",
522
  "language_model.model.layers.39.self_attn.o_proj",
523
  "language_model.model.layers.39.feed_forward.router",
 
 
 
524
  "language_model.model.layers.40.self_attn.q_proj",
525
  "language_model.model.layers.40.self_attn.k_proj",
526
  "language_model.model.layers.40.self_attn.v_proj",
527
  "language_model.model.layers.40.self_attn.o_proj",
528
  "language_model.model.layers.40.feed_forward.router",
 
 
 
529
  "language_model.model.layers.41.self_attn.q_proj",
530
  "language_model.model.layers.41.self_attn.k_proj",
531
  "language_model.model.layers.41.self_attn.v_proj",
532
  "language_model.model.layers.41.self_attn.o_proj",
533
  "language_model.model.layers.41.feed_forward.router",
 
 
 
534
  "language_model.model.layers.42.self_attn.q_proj",
535
  "language_model.model.layers.42.self_attn.k_proj",
536
  "language_model.model.layers.42.self_attn.v_proj",
537
  "language_model.model.layers.42.self_attn.o_proj",
538
  "language_model.model.layers.42.feed_forward.router",
 
 
 
539
  "language_model.model.layers.43.self_attn.q_proj",
540
  "language_model.model.layers.43.self_attn.k_proj",
541
  "language_model.model.layers.43.self_attn.v_proj",
542
  "language_model.model.layers.43.self_attn.o_proj",
543
  "language_model.model.layers.43.feed_forward.router",
 
 
 
544
  "language_model.model.layers.44.self_attn.q_proj",
545
  "language_model.model.layers.44.self_attn.k_proj",
546
  "language_model.model.layers.44.self_attn.v_proj",
547
  "language_model.model.layers.44.self_attn.o_proj",
548
  "language_model.model.layers.44.feed_forward.router",
 
 
 
549
  "language_model.model.layers.45.self_attn.q_proj",
550
  "language_model.model.layers.45.self_attn.k_proj",
551
  "language_model.model.layers.45.self_attn.v_proj",
552
  "language_model.model.layers.45.self_attn.o_proj",
553
  "language_model.model.layers.45.feed_forward.router",
 
 
 
554
  "language_model.model.layers.46.self_attn.q_proj",
555
  "language_model.model.layers.46.self_attn.k_proj",
556
  "language_model.model.layers.46.self_attn.v_proj",
557
  "language_model.model.layers.46.self_attn.o_proj",
558
  "language_model.model.layers.46.feed_forward.router",
 
 
 
559
  "language_model.model.layers.47.self_attn.q_proj",
560
  "language_model.model.layers.47.self_attn.k_proj",
561
  "language_model.model.layers.47.self_attn.v_proj",
562
  "language_model.model.layers.47.self_attn.o_proj",
563
  "language_model.model.layers.47.feed_forward.router",
 
 
 
564
  "language_model.lm_head"
565
  ],
566
  "kv_cache_scheme": null,
config.json.original DELETED
@@ -1,817 +0,0 @@
1
- {
2
- "architectures": [
3
- "Llama4ForConditionalGeneration"
4
- ],
5
- "boi_token_index": 200080,
6
- "eoi_token_index": 200081,
7
- "image_token_index": 200092,
8
- "model_type": "llama4",
9
- "quantization_config": {
10
- "config_groups": {
11
- "group_0": {
12
- "input_activations": {
13
- "actorder": null,
14
- "block_structure": null,
15
- "dynamic": true,
16
- "group_size": null,
17
- "num_bits": 8,
18
- "observer": null,
19
- "observer_kwargs": {},
20
- "strategy": "token",
21
- "symmetric": true,
22
- "type": "float"
23
- },
24
- "output_activations": null,
25
- "targets": [
26
- "Linear"
27
- ],
28
- "weights": {
29
- "actorder": null,
30
- "block_structure": null,
31
- "dynamic": false,
32
- "group_size": null,
33
- "num_bits": 8,
34
- "observer": "mse",
35
- "observer_kwargs": {},
36
- "strategy": "channel",
37
- "symmetric": true,
38
- "type": "float"
39
- }
40
- }
41
- },
42
- "format": "float-quantized",
43
- "global_compression_ratio": null,
44
- "ignore": [
45
- "vision_model.patch_embedding.linear",
46
- "vision_model.model.layers.0.self_attn.q_proj",
47
- "vision_model.model.layers.0.self_attn.k_proj",
48
- "vision_model.model.layers.0.self_attn.v_proj",
49
- "vision_model.model.layers.0.self_attn.o_proj",
50
- "vision_model.model.layers.0.mlp.fc1",
51
- "vision_model.model.layers.0.mlp.fc2",
52
- "vision_model.model.layers.1.self_attn.q_proj",
53
- "vision_model.model.layers.1.self_attn.k_proj",
54
- "vision_model.model.layers.1.self_attn.v_proj",
55
- "vision_model.model.layers.1.self_attn.o_proj",
56
- "vision_model.model.layers.1.mlp.fc1",
57
- "vision_model.model.layers.1.mlp.fc2",
58
- "vision_model.model.layers.2.self_attn.q_proj",
59
- "vision_model.model.layers.2.self_attn.k_proj",
60
- "vision_model.model.layers.2.self_attn.v_proj",
61
- "vision_model.model.layers.2.self_attn.o_proj",
62
- "vision_model.model.layers.2.mlp.fc1",
63
- "vision_model.model.layers.2.mlp.fc2",
64
- "vision_model.model.layers.3.self_attn.q_proj",
65
- "vision_model.model.layers.3.self_attn.k_proj",
66
- "vision_model.model.layers.3.self_attn.v_proj",
67
- "vision_model.model.layers.3.self_attn.o_proj",
68
- "vision_model.model.layers.3.mlp.fc1",
69
- "vision_model.model.layers.3.mlp.fc2",
70
- "vision_model.model.layers.4.self_attn.q_proj",
71
- "vision_model.model.layers.4.self_attn.k_proj",
72
- "vision_model.model.layers.4.self_attn.v_proj",
73
- "vision_model.model.layers.4.self_attn.o_proj",
74
- "vision_model.model.layers.4.mlp.fc1",
75
- "vision_model.model.layers.4.mlp.fc2",
76
- "vision_model.model.layers.5.self_attn.q_proj",
77
- "vision_model.model.layers.5.self_attn.k_proj",
78
- "vision_model.model.layers.5.self_attn.v_proj",
79
- "vision_model.model.layers.5.self_attn.o_proj",
80
- "vision_model.model.layers.5.mlp.fc1",
81
- "vision_model.model.layers.5.mlp.fc2",
82
- "vision_model.model.layers.6.self_attn.q_proj",
83
- "vision_model.model.layers.6.self_attn.k_proj",
84
- "vision_model.model.layers.6.self_attn.v_proj",
85
- "vision_model.model.layers.6.self_attn.o_proj",
86
- "vision_model.model.layers.6.mlp.fc1",
87
- "vision_model.model.layers.6.mlp.fc2",
88
- "vision_model.model.layers.7.self_attn.q_proj",
89
- "vision_model.model.layers.7.self_attn.k_proj",
90
- "vision_model.model.layers.7.self_attn.v_proj",
91
- "vision_model.model.layers.7.self_attn.o_proj",
92
- "vision_model.model.layers.7.mlp.fc1",
93
- "vision_model.model.layers.7.mlp.fc2",
94
- "vision_model.model.layers.8.self_attn.q_proj",
95
- "vision_model.model.layers.8.self_attn.k_proj",
96
- "vision_model.model.layers.8.self_attn.v_proj",
97
- "vision_model.model.layers.8.self_attn.o_proj",
98
- "vision_model.model.layers.8.mlp.fc1",
99
- "vision_model.model.layers.8.mlp.fc2",
100
- "vision_model.model.layers.9.self_attn.q_proj",
101
- "vision_model.model.layers.9.self_attn.k_proj",
102
- "vision_model.model.layers.9.self_attn.v_proj",
103
- "vision_model.model.layers.9.self_attn.o_proj",
104
- "vision_model.model.layers.9.mlp.fc1",
105
- "vision_model.model.layers.9.mlp.fc2",
106
- "vision_model.model.layers.10.self_attn.q_proj",
107
- "vision_model.model.layers.10.self_attn.k_proj",
108
- "vision_model.model.layers.10.self_attn.v_proj",
109
- "vision_model.model.layers.10.self_attn.o_proj",
110
- "vision_model.model.layers.10.mlp.fc1",
111
- "vision_model.model.layers.10.mlp.fc2",
112
- "vision_model.model.layers.11.self_attn.q_proj",
113
- "vision_model.model.layers.11.self_attn.k_proj",
114
- "vision_model.model.layers.11.self_attn.v_proj",
115
- "vision_model.model.layers.11.self_attn.o_proj",
116
- "vision_model.model.layers.11.mlp.fc1",
117
- "vision_model.model.layers.11.mlp.fc2",
118
- "vision_model.model.layers.12.self_attn.q_proj",
119
- "vision_model.model.layers.12.self_attn.k_proj",
120
- "vision_model.model.layers.12.self_attn.v_proj",
121
- "vision_model.model.layers.12.self_attn.o_proj",
122
- "vision_model.model.layers.12.mlp.fc1",
123
- "vision_model.model.layers.12.mlp.fc2",
124
- "vision_model.model.layers.13.self_attn.q_proj",
125
- "vision_model.model.layers.13.self_attn.k_proj",
126
- "vision_model.model.layers.13.self_attn.v_proj",
127
- "vision_model.model.layers.13.self_attn.o_proj",
128
- "vision_model.model.layers.13.mlp.fc1",
129
- "vision_model.model.layers.13.mlp.fc2",
130
- "vision_model.model.layers.14.self_attn.q_proj",
131
- "vision_model.model.layers.14.self_attn.k_proj",
132
- "vision_model.model.layers.14.self_attn.v_proj",
133
- "vision_model.model.layers.14.self_attn.o_proj",
134
- "vision_model.model.layers.14.mlp.fc1",
135
- "vision_model.model.layers.14.mlp.fc2",
136
- "vision_model.model.layers.15.self_attn.q_proj",
137
- "vision_model.model.layers.15.self_attn.k_proj",
138
- "vision_model.model.layers.15.self_attn.v_proj",
139
- "vision_model.model.layers.15.self_attn.o_proj",
140
- "vision_model.model.layers.15.mlp.fc1",
141
- "vision_model.model.layers.15.mlp.fc2",
142
- "vision_model.model.layers.16.self_attn.q_proj",
143
- "vision_model.model.layers.16.self_attn.k_proj",
144
- "vision_model.model.layers.16.self_attn.v_proj",
145
- "vision_model.model.layers.16.self_attn.o_proj",
146
- "vision_model.model.layers.16.mlp.fc1",
147
- "vision_model.model.layers.16.mlp.fc2",
148
- "vision_model.model.layers.17.self_attn.q_proj",
149
- "vision_model.model.layers.17.self_attn.k_proj",
150
- "vision_model.model.layers.17.self_attn.v_proj",
151
- "vision_model.model.layers.17.self_attn.o_proj",
152
- "vision_model.model.layers.17.mlp.fc1",
153
- "vision_model.model.layers.17.mlp.fc2",
154
- "vision_model.model.layers.18.self_attn.q_proj",
155
- "vision_model.model.layers.18.self_attn.k_proj",
156
- "vision_model.model.layers.18.self_attn.v_proj",
157
- "vision_model.model.layers.18.self_attn.o_proj",
158
- "vision_model.model.layers.18.mlp.fc1",
159
- "vision_model.model.layers.18.mlp.fc2",
160
- "vision_model.model.layers.19.self_attn.q_proj",
161
- "vision_model.model.layers.19.self_attn.k_proj",
162
- "vision_model.model.layers.19.self_attn.v_proj",
163
- "vision_model.model.layers.19.self_attn.o_proj",
164
- "vision_model.model.layers.19.mlp.fc1",
165
- "vision_model.model.layers.19.mlp.fc2",
166
- "vision_model.model.layers.20.self_attn.q_proj",
167
- "vision_model.model.layers.20.self_attn.k_proj",
168
- "vision_model.model.layers.20.self_attn.v_proj",
169
- "vision_model.model.layers.20.self_attn.o_proj",
170
- "vision_model.model.layers.20.mlp.fc1",
171
- "vision_model.model.layers.20.mlp.fc2",
172
- "vision_model.model.layers.21.self_attn.q_proj",
173
- "vision_model.model.layers.21.self_attn.k_proj",
174
- "vision_model.model.layers.21.self_attn.v_proj",
175
- "vision_model.model.layers.21.self_attn.o_proj",
176
- "vision_model.model.layers.21.mlp.fc1",
177
- "vision_model.model.layers.21.mlp.fc2",
178
- "vision_model.model.layers.22.self_attn.q_proj",
179
- "vision_model.model.layers.22.self_attn.k_proj",
180
- "vision_model.model.layers.22.self_attn.v_proj",
181
- "vision_model.model.layers.22.self_attn.o_proj",
182
- "vision_model.model.layers.22.mlp.fc1",
183
- "vision_model.model.layers.22.mlp.fc2",
184
- "vision_model.model.layers.23.self_attn.q_proj",
185
- "vision_model.model.layers.23.self_attn.k_proj",
186
- "vision_model.model.layers.23.self_attn.v_proj",
187
- "vision_model.model.layers.23.self_attn.o_proj",
188
- "vision_model.model.layers.23.mlp.fc1",
189
- "vision_model.model.layers.23.mlp.fc2",
190
- "vision_model.model.layers.24.self_attn.q_proj",
191
- "vision_model.model.layers.24.self_attn.k_proj",
192
- "vision_model.model.layers.24.self_attn.v_proj",
193
- "vision_model.model.layers.24.self_attn.o_proj",
194
- "vision_model.model.layers.24.mlp.fc1",
195
- "vision_model.model.layers.24.mlp.fc2",
196
- "vision_model.model.layers.25.self_attn.q_proj",
197
- "vision_model.model.layers.25.self_attn.k_proj",
198
- "vision_model.model.layers.25.self_attn.v_proj",
199
- "vision_model.model.layers.25.self_attn.o_proj",
200
- "vision_model.model.layers.25.mlp.fc1",
201
- "vision_model.model.layers.25.mlp.fc2",
202
- "vision_model.model.layers.26.self_attn.q_proj",
203
- "vision_model.model.layers.26.self_attn.k_proj",
204
- "vision_model.model.layers.26.self_attn.v_proj",
205
- "vision_model.model.layers.26.self_attn.o_proj",
206
- "vision_model.model.layers.26.mlp.fc1",
207
- "vision_model.model.layers.26.mlp.fc2",
208
- "vision_model.model.layers.27.self_attn.q_proj",
209
- "vision_model.model.layers.27.self_attn.k_proj",
210
- "vision_model.model.layers.27.self_attn.v_proj",
211
- "vision_model.model.layers.27.self_attn.o_proj",
212
- "vision_model.model.layers.27.mlp.fc1",
213
- "vision_model.model.layers.27.mlp.fc2",
214
- "vision_model.model.layers.28.self_attn.q_proj",
215
- "vision_model.model.layers.28.self_attn.k_proj",
216
- "vision_model.model.layers.28.self_attn.v_proj",
217
- "vision_model.model.layers.28.self_attn.o_proj",
218
- "vision_model.model.layers.28.mlp.fc1",
219
- "vision_model.model.layers.28.mlp.fc2",
220
- "vision_model.model.layers.29.self_attn.q_proj",
221
- "vision_model.model.layers.29.self_attn.k_proj",
222
- "vision_model.model.layers.29.self_attn.v_proj",
223
- "vision_model.model.layers.29.self_attn.o_proj",
224
- "vision_model.model.layers.29.mlp.fc1",
225
- "vision_model.model.layers.29.mlp.fc2",
226
- "vision_model.model.layers.30.self_attn.q_proj",
227
- "vision_model.model.layers.30.self_attn.k_proj",
228
- "vision_model.model.layers.30.self_attn.v_proj",
229
- "vision_model.model.layers.30.self_attn.o_proj",
230
- "vision_model.model.layers.30.mlp.fc1",
231
- "vision_model.model.layers.30.mlp.fc2",
232
- "vision_model.model.layers.31.self_attn.q_proj",
233
- "vision_model.model.layers.31.self_attn.k_proj",
234
- "vision_model.model.layers.31.self_attn.v_proj",
235
- "vision_model.model.layers.31.self_attn.o_proj",
236
- "vision_model.model.layers.31.mlp.fc1",
237
- "vision_model.model.layers.31.mlp.fc2",
238
- "vision_model.model.layers.32.self_attn.q_proj",
239
- "vision_model.model.layers.32.self_attn.k_proj",
240
- "vision_model.model.layers.32.self_attn.v_proj",
241
- "vision_model.model.layers.32.self_attn.o_proj",
242
- "vision_model.model.layers.32.mlp.fc1",
243
- "vision_model.model.layers.32.mlp.fc2",
244
- "vision_model.model.layers.33.self_attn.q_proj",
245
- "vision_model.model.layers.33.self_attn.k_proj",
246
- "vision_model.model.layers.33.self_attn.v_proj",
247
- "vision_model.model.layers.33.self_attn.o_proj",
248
- "vision_model.model.layers.33.mlp.fc1",
249
- "vision_model.model.layers.33.mlp.fc2",
250
- "vision_model.vision_adapter.mlp.fc1",
251
- "vision_model.vision_adapter.mlp.fc2",
252
- "multi_modal_projector.linear_1",
253
- "language_model.model.layers.0.self_attn.q_proj",
254
- "language_model.model.layers.0.self_attn.k_proj",
255
- "language_model.model.layers.0.self_attn.v_proj",
256
- "language_model.model.layers.0.self_attn.o_proj",
257
- "language_model.model.layers.0.feed_forward.router",
258
- "language_model.model.layers.0.feed_forward.shared_expert.gate_proj",
259
- "language_model.model.layers.0.feed_forward.shared_expert.up_proj",
260
- "language_model.model.layers.0.feed_forward.shared_expert.down_proj",
261
- "language_model.model.layers.1.self_attn.q_proj",
262
- "language_model.model.layers.1.self_attn.k_proj",
263
- "language_model.model.layers.1.self_attn.v_proj",
264
- "language_model.model.layers.1.self_attn.o_proj",
265
- "language_model.model.layers.1.feed_forward.router",
266
- "language_model.model.layers.1.feed_forward.shared_expert.gate_proj",
267
- "language_model.model.layers.1.feed_forward.shared_expert.up_proj",
268
- "language_model.model.layers.1.feed_forward.shared_expert.down_proj",
269
- "language_model.model.layers.2.self_attn.q_proj",
270
- "language_model.model.layers.2.self_attn.k_proj",
271
- "language_model.model.layers.2.self_attn.v_proj",
272
- "language_model.model.layers.2.self_attn.o_proj",
273
- "language_model.model.layers.2.feed_forward.router",
274
- "language_model.model.layers.2.feed_forward.shared_expert.gate_proj",
275
- "language_model.model.layers.2.feed_forward.shared_expert.up_proj",
276
- "language_model.model.layers.2.feed_forward.shared_expert.down_proj",
277
- "language_model.model.layers.3.self_attn.q_proj",
278
- "language_model.model.layers.3.self_attn.k_proj",
279
- "language_model.model.layers.3.self_attn.v_proj",
280
- "language_model.model.layers.3.self_attn.o_proj",
281
- "language_model.model.layers.3.feed_forward.router",
282
- "language_model.model.layers.3.feed_forward.shared_expert.gate_proj",
283
- "language_model.model.layers.3.feed_forward.shared_expert.up_proj",
284
- "language_model.model.layers.3.feed_forward.shared_expert.down_proj",
285
- "language_model.model.layers.4.self_attn.q_proj",
286
- "language_model.model.layers.4.self_attn.k_proj",
287
- "language_model.model.layers.4.self_attn.v_proj",
288
- "language_model.model.layers.4.self_attn.o_proj",
289
- "language_model.model.layers.4.feed_forward.router",
290
- "language_model.model.layers.4.feed_forward.shared_expert.gate_proj",
291
- "language_model.model.layers.4.feed_forward.shared_expert.up_proj",
292
- "language_model.model.layers.4.feed_forward.shared_expert.down_proj",
293
- "language_model.model.layers.5.self_attn.q_proj",
294
- "language_model.model.layers.5.self_attn.k_proj",
295
- "language_model.model.layers.5.self_attn.v_proj",
296
- "language_model.model.layers.5.self_attn.o_proj",
297
- "language_model.model.layers.5.feed_forward.router",
298
- "language_model.model.layers.5.feed_forward.shared_expert.gate_proj",
299
- "language_model.model.layers.5.feed_forward.shared_expert.up_proj",
300
- "language_model.model.layers.5.feed_forward.shared_expert.down_proj",
301
- "language_model.model.layers.6.self_attn.q_proj",
302
- "language_model.model.layers.6.self_attn.k_proj",
303
- "language_model.model.layers.6.self_attn.v_proj",
304
- "language_model.model.layers.6.self_attn.o_proj",
305
- "language_model.model.layers.6.feed_forward.router",
306
- "language_model.model.layers.6.feed_forward.shared_expert.gate_proj",
307
- "language_model.model.layers.6.feed_forward.shared_expert.up_proj",
308
- "language_model.model.layers.6.feed_forward.shared_expert.down_proj",
309
- "language_model.model.layers.7.self_attn.q_proj",
310
- "language_model.model.layers.7.self_attn.k_proj",
311
- "language_model.model.layers.7.self_attn.v_proj",
312
- "language_model.model.layers.7.self_attn.o_proj",
313
- "language_model.model.layers.7.feed_forward.router",
314
- "language_model.model.layers.7.feed_forward.shared_expert.gate_proj",
315
- "language_model.model.layers.7.feed_forward.shared_expert.up_proj",
316
- "language_model.model.layers.7.feed_forward.shared_expert.down_proj",
317
- "language_model.model.layers.8.self_attn.q_proj",
318
- "language_model.model.layers.8.self_attn.k_proj",
319
- "language_model.model.layers.8.self_attn.v_proj",
320
- "language_model.model.layers.8.self_attn.o_proj",
321
- "language_model.model.layers.8.feed_forward.router",
322
- "language_model.model.layers.8.feed_forward.shared_expert.gate_proj",
323
- "language_model.model.layers.8.feed_forward.shared_expert.up_proj",
324
- "language_model.model.layers.8.feed_forward.shared_expert.down_proj",
325
- "language_model.model.layers.9.self_attn.q_proj",
326
- "language_model.model.layers.9.self_attn.k_proj",
327
- "language_model.model.layers.9.self_attn.v_proj",
328
- "language_model.model.layers.9.self_attn.o_proj",
329
- "language_model.model.layers.9.feed_forward.router",
330
- "language_model.model.layers.9.feed_forward.shared_expert.gate_proj",
331
- "language_model.model.layers.9.feed_forward.shared_expert.up_proj",
332
- "language_model.model.layers.9.feed_forward.shared_expert.down_proj",
333
- "language_model.model.layers.10.self_attn.q_proj",
334
- "language_model.model.layers.10.self_attn.k_proj",
335
- "language_model.model.layers.10.self_attn.v_proj",
336
- "language_model.model.layers.10.self_attn.o_proj",
337
- "language_model.model.layers.10.feed_forward.router",
338
- "language_model.model.layers.10.feed_forward.shared_expert.gate_proj",
339
- "language_model.model.layers.10.feed_forward.shared_expert.up_proj",
340
- "language_model.model.layers.10.feed_forward.shared_expert.down_proj",
341
- "language_model.model.layers.11.self_attn.q_proj",
342
- "language_model.model.layers.11.self_attn.k_proj",
343
- "language_model.model.layers.11.self_attn.v_proj",
344
- "language_model.model.layers.11.self_attn.o_proj",
345
- "language_model.model.layers.11.feed_forward.router",
346
- "language_model.model.layers.11.feed_forward.shared_expert.gate_proj",
347
- "language_model.model.layers.11.feed_forward.shared_expert.up_proj",
348
- "language_model.model.layers.11.feed_forward.shared_expert.down_proj",
349
- "language_model.model.layers.12.self_attn.q_proj",
350
- "language_model.model.layers.12.self_attn.k_proj",
351
- "language_model.model.layers.12.self_attn.v_proj",
352
- "language_model.model.layers.12.self_attn.o_proj",
353
- "language_model.model.layers.12.feed_forward.router",
354
- "language_model.model.layers.12.feed_forward.shared_expert.gate_proj",
355
- "language_model.model.layers.12.feed_forward.shared_expert.up_proj",
356
- "language_model.model.layers.12.feed_forward.shared_expert.down_proj",
357
- "language_model.model.layers.13.self_attn.q_proj",
358
- "language_model.model.layers.13.self_attn.k_proj",
359
- "language_model.model.layers.13.self_attn.v_proj",
360
- "language_model.model.layers.13.self_attn.o_proj",
361
- "language_model.model.layers.13.feed_forward.router",
362
- "language_model.model.layers.13.feed_forward.shared_expert.gate_proj",
363
- "language_model.model.layers.13.feed_forward.shared_expert.up_proj",
364
- "language_model.model.layers.13.feed_forward.shared_expert.down_proj",
365
- "language_model.model.layers.14.self_attn.q_proj",
366
- "language_model.model.layers.14.self_attn.k_proj",
367
- "language_model.model.layers.14.self_attn.v_proj",
368
- "language_model.model.layers.14.self_attn.o_proj",
369
- "language_model.model.layers.14.feed_forward.router",
370
- "language_model.model.layers.14.feed_forward.shared_expert.gate_proj",
371
- "language_model.model.layers.14.feed_forward.shared_expert.up_proj",
372
- "language_model.model.layers.14.feed_forward.shared_expert.down_proj",
373
- "language_model.model.layers.15.self_attn.q_proj",
374
- "language_model.model.layers.15.self_attn.k_proj",
375
- "language_model.model.layers.15.self_attn.v_proj",
376
- "language_model.model.layers.15.self_attn.o_proj",
377
- "language_model.model.layers.15.feed_forward.router",
378
- "language_model.model.layers.15.feed_forward.shared_expert.gate_proj",
379
- "language_model.model.layers.15.feed_forward.shared_expert.up_proj",
380
- "language_model.model.layers.15.feed_forward.shared_expert.down_proj",
381
- "language_model.model.layers.16.self_attn.q_proj",
382
- "language_model.model.layers.16.self_attn.k_proj",
383
- "language_model.model.layers.16.self_attn.v_proj",
384
- "language_model.model.layers.16.self_attn.o_proj",
385
- "language_model.model.layers.16.feed_forward.router",
386
- "language_model.model.layers.16.feed_forward.shared_expert.gate_proj",
387
- "language_model.model.layers.16.feed_forward.shared_expert.up_proj",
388
- "language_model.model.layers.16.feed_forward.shared_expert.down_proj",
389
- "language_model.model.layers.17.self_attn.q_proj",
390
- "language_model.model.layers.17.self_attn.k_proj",
391
- "language_model.model.layers.17.self_attn.v_proj",
392
- "language_model.model.layers.17.self_attn.o_proj",
393
- "language_model.model.layers.17.feed_forward.router",
394
- "language_model.model.layers.17.feed_forward.shared_expert.gate_proj",
395
- "language_model.model.layers.17.feed_forward.shared_expert.up_proj",
396
- "language_model.model.layers.17.feed_forward.shared_expert.down_proj",
397
- "language_model.model.layers.18.self_attn.q_proj",
398
- "language_model.model.layers.18.self_attn.k_proj",
399
- "language_model.model.layers.18.self_attn.v_proj",
400
- "language_model.model.layers.18.self_attn.o_proj",
401
- "language_model.model.layers.18.feed_forward.router",
402
- "language_model.model.layers.18.feed_forward.shared_expert.gate_proj",
403
- "language_model.model.layers.18.feed_forward.shared_expert.up_proj",
404
- "language_model.model.layers.18.feed_forward.shared_expert.down_proj",
405
- "language_model.model.layers.19.self_attn.q_proj",
406
- "language_model.model.layers.19.self_attn.k_proj",
407
- "language_model.model.layers.19.self_attn.v_proj",
408
- "language_model.model.layers.19.self_attn.o_proj",
409
- "language_model.model.layers.19.feed_forward.router",
410
- "language_model.model.layers.19.feed_forward.shared_expert.gate_proj",
411
- "language_model.model.layers.19.feed_forward.shared_expert.up_proj",
412
- "language_model.model.layers.19.feed_forward.shared_expert.down_proj",
413
- "language_model.model.layers.20.self_attn.q_proj",
414
- "language_model.model.layers.20.self_attn.k_proj",
415
- "language_model.model.layers.20.self_attn.v_proj",
416
- "language_model.model.layers.20.self_attn.o_proj",
417
- "language_model.model.layers.20.feed_forward.router",
418
- "language_model.model.layers.20.feed_forward.shared_expert.gate_proj",
419
- "language_model.model.layers.20.feed_forward.shared_expert.up_proj",
420
- "language_model.model.layers.20.feed_forward.shared_expert.down_proj",
421
- "language_model.model.layers.21.self_attn.q_proj",
422
- "language_model.model.layers.21.self_attn.k_proj",
423
- "language_model.model.layers.21.self_attn.v_proj",
424
- "language_model.model.layers.21.self_attn.o_proj",
425
- "language_model.model.layers.21.feed_forward.router",
426
- "language_model.model.layers.21.feed_forward.shared_expert.gate_proj",
427
- "language_model.model.layers.21.feed_forward.shared_expert.up_proj",
428
- "language_model.model.layers.21.feed_forward.shared_expert.down_proj",
429
- "language_model.model.layers.22.self_attn.q_proj",
430
- "language_model.model.layers.22.self_attn.k_proj",
431
- "language_model.model.layers.22.self_attn.v_proj",
432
- "language_model.model.layers.22.self_attn.o_proj",
433
- "language_model.model.layers.22.feed_forward.router",
434
- "language_model.model.layers.22.feed_forward.shared_expert.gate_proj",
435
- "language_model.model.layers.22.feed_forward.shared_expert.up_proj",
436
- "language_model.model.layers.22.feed_forward.shared_expert.down_proj",
437
- "language_model.model.layers.23.self_attn.q_proj",
438
- "language_model.model.layers.23.self_attn.k_proj",
439
- "language_model.model.layers.23.self_attn.v_proj",
440
- "language_model.model.layers.23.self_attn.o_proj",
441
- "language_model.model.layers.23.feed_forward.router",
442
- "language_model.model.layers.23.feed_forward.shared_expert.gate_proj",
443
- "language_model.model.layers.23.feed_forward.shared_expert.up_proj",
444
- "language_model.model.layers.23.feed_forward.shared_expert.down_proj",
445
- "language_model.model.layers.24.self_attn.q_proj",
446
- "language_model.model.layers.24.self_attn.k_proj",
447
- "language_model.model.layers.24.self_attn.v_proj",
448
- "language_model.model.layers.24.self_attn.o_proj",
449
- "language_model.model.layers.24.feed_forward.router",
450
- "language_model.model.layers.24.feed_forward.shared_expert.gate_proj",
451
- "language_model.model.layers.24.feed_forward.shared_expert.up_proj",
452
- "language_model.model.layers.24.feed_forward.shared_expert.down_proj",
453
- "language_model.model.layers.25.self_attn.q_proj",
454
- "language_model.model.layers.25.self_attn.k_proj",
455
- "language_model.model.layers.25.self_attn.v_proj",
456
- "language_model.model.layers.25.self_attn.o_proj",
457
- "language_model.model.layers.25.feed_forward.router",
458
- "language_model.model.layers.25.feed_forward.shared_expert.gate_proj",
459
- "language_model.model.layers.25.feed_forward.shared_expert.up_proj",
460
- "language_model.model.layers.25.feed_forward.shared_expert.down_proj",
461
- "language_model.model.layers.26.self_attn.q_proj",
462
- "language_model.model.layers.26.self_attn.k_proj",
463
- "language_model.model.layers.26.self_attn.v_proj",
464
- "language_model.model.layers.26.self_attn.o_proj",
465
- "language_model.model.layers.26.feed_forward.router",
466
- "language_model.model.layers.26.feed_forward.shared_expert.gate_proj",
467
- "language_model.model.layers.26.feed_forward.shared_expert.up_proj",
468
- "language_model.model.layers.26.feed_forward.shared_expert.down_proj",
469
- "language_model.model.layers.27.self_attn.q_proj",
470
- "language_model.model.layers.27.self_attn.k_proj",
471
- "language_model.model.layers.27.self_attn.v_proj",
472
- "language_model.model.layers.27.self_attn.o_proj",
473
- "language_model.model.layers.27.feed_forward.router",
474
- "language_model.model.layers.27.feed_forward.shared_expert.gate_proj",
475
- "language_model.model.layers.27.feed_forward.shared_expert.up_proj",
476
- "language_model.model.layers.27.feed_forward.shared_expert.down_proj",
477
- "language_model.model.layers.28.self_attn.q_proj",
478
- "language_model.model.layers.28.self_attn.k_proj",
479
- "language_model.model.layers.28.self_attn.v_proj",
480
- "language_model.model.layers.28.self_attn.o_proj",
481
- "language_model.model.layers.28.feed_forward.router",
482
- "language_model.model.layers.28.feed_forward.shared_expert.gate_proj",
483
- "language_model.model.layers.28.feed_forward.shared_expert.up_proj",
484
- "language_model.model.layers.28.feed_forward.shared_expert.down_proj",
485
- "language_model.model.layers.29.self_attn.q_proj",
486
- "language_model.model.layers.29.self_attn.k_proj",
487
- "language_model.model.layers.29.self_attn.v_proj",
488
- "language_model.model.layers.29.self_attn.o_proj",
489
- "language_model.model.layers.29.feed_forward.router",
490
- "language_model.model.layers.29.feed_forward.shared_expert.gate_proj",
491
- "language_model.model.layers.29.feed_forward.shared_expert.up_proj",
492
- "language_model.model.layers.29.feed_forward.shared_expert.down_proj",
493
- "language_model.model.layers.30.self_attn.q_proj",
494
- "language_model.model.layers.30.self_attn.k_proj",
495
- "language_model.model.layers.30.self_attn.v_proj",
496
- "language_model.model.layers.30.self_attn.o_proj",
497
- "language_model.model.layers.30.feed_forward.router",
498
- "language_model.model.layers.30.feed_forward.shared_expert.gate_proj",
499
- "language_model.model.layers.30.feed_forward.shared_expert.up_proj",
500
- "language_model.model.layers.30.feed_forward.shared_expert.down_proj",
501
- "language_model.model.layers.31.self_attn.q_proj",
502
- "language_model.model.layers.31.self_attn.k_proj",
503
- "language_model.model.layers.31.self_attn.v_proj",
504
- "language_model.model.layers.31.self_attn.o_proj",
505
- "language_model.model.layers.31.feed_forward.router",
506
- "language_model.model.layers.31.feed_forward.shared_expert.gate_proj",
507
- "language_model.model.layers.31.feed_forward.shared_expert.up_proj",
508
- "language_model.model.layers.31.feed_forward.shared_expert.down_proj",
509
- "language_model.model.layers.32.self_attn.q_proj",
510
- "language_model.model.layers.32.self_attn.k_proj",
511
- "language_model.model.layers.32.self_attn.v_proj",
512
- "language_model.model.layers.32.self_attn.o_proj",
513
- "language_model.model.layers.32.feed_forward.router",
514
- "language_model.model.layers.32.feed_forward.shared_expert.gate_proj",
515
- "language_model.model.layers.32.feed_forward.shared_expert.up_proj",
516
- "language_model.model.layers.32.feed_forward.shared_expert.down_proj",
517
- "language_model.model.layers.33.self_attn.q_proj",
518
- "language_model.model.layers.33.self_attn.k_proj",
519
- "language_model.model.layers.33.self_attn.v_proj",
520
- "language_model.model.layers.33.self_attn.o_proj",
521
- "language_model.model.layers.33.feed_forward.router",
522
- "language_model.model.layers.33.feed_forward.shared_expert.gate_proj",
523
- "language_model.model.layers.33.feed_forward.shared_expert.up_proj",
524
- "language_model.model.layers.33.feed_forward.shared_expert.down_proj",
525
- "language_model.model.layers.34.self_attn.q_proj",
526
- "language_model.model.layers.34.self_attn.k_proj",
527
- "language_model.model.layers.34.self_attn.v_proj",
528
- "language_model.model.layers.34.self_attn.o_proj",
529
- "language_model.model.layers.34.feed_forward.router",
530
- "language_model.model.layers.34.feed_forward.shared_expert.gate_proj",
531
- "language_model.model.layers.34.feed_forward.shared_expert.up_proj",
532
- "language_model.model.layers.34.feed_forward.shared_expert.down_proj",
533
- "language_model.model.layers.35.self_attn.q_proj",
534
- "language_model.model.layers.35.self_attn.k_proj",
535
- "language_model.model.layers.35.self_attn.v_proj",
536
- "language_model.model.layers.35.self_attn.o_proj",
537
- "language_model.model.layers.35.feed_forward.router",
538
- "language_model.model.layers.35.feed_forward.shared_expert.gate_proj",
539
- "language_model.model.layers.35.feed_forward.shared_expert.up_proj",
540
- "language_model.model.layers.35.feed_forward.shared_expert.down_proj",
541
- "language_model.model.layers.36.self_attn.q_proj",
542
- "language_model.model.layers.36.self_attn.k_proj",
543
- "language_model.model.layers.36.self_attn.v_proj",
544
- "language_model.model.layers.36.self_attn.o_proj",
545
- "language_model.model.layers.36.feed_forward.router",
546
- "language_model.model.layers.36.feed_forward.shared_expert.gate_proj",
547
- "language_model.model.layers.36.feed_forward.shared_expert.up_proj",
548
- "language_model.model.layers.36.feed_forward.shared_expert.down_proj",
549
- "language_model.model.layers.37.self_attn.q_proj",
550
- "language_model.model.layers.37.self_attn.k_proj",
551
- "language_model.model.layers.37.self_attn.v_proj",
552
- "language_model.model.layers.37.self_attn.o_proj",
553
- "language_model.model.layers.37.feed_forward.router",
554
- "language_model.model.layers.37.feed_forward.shared_expert.gate_proj",
555
- "language_model.model.layers.37.feed_forward.shared_expert.up_proj",
556
- "language_model.model.layers.37.feed_forward.shared_expert.down_proj",
557
- "language_model.model.layers.38.self_attn.q_proj",
558
- "language_model.model.layers.38.self_attn.k_proj",
559
- "language_model.model.layers.38.self_attn.v_proj",
560
- "language_model.model.layers.38.self_attn.o_proj",
561
- "language_model.model.layers.38.feed_forward.router",
562
- "language_model.model.layers.38.feed_forward.shared_expert.gate_proj",
563
- "language_model.model.layers.38.feed_forward.shared_expert.up_proj",
564
- "language_model.model.layers.38.feed_forward.shared_expert.down_proj",
565
- "language_model.model.layers.39.self_attn.q_proj",
566
- "language_model.model.layers.39.self_attn.k_proj",
567
- "language_model.model.layers.39.self_attn.v_proj",
568
- "language_model.model.layers.39.self_attn.o_proj",
569
- "language_model.model.layers.39.feed_forward.router",
570
- "language_model.model.layers.39.feed_forward.shared_expert.gate_proj",
571
- "language_model.model.layers.39.feed_forward.shared_expert.up_proj",
572
- "language_model.model.layers.39.feed_forward.shared_expert.down_proj",
573
- "language_model.model.layers.40.self_attn.q_proj",
574
- "language_model.model.layers.40.self_attn.k_proj",
575
- "language_model.model.layers.40.self_attn.v_proj",
576
- "language_model.model.layers.40.self_attn.o_proj",
577
- "language_model.model.layers.40.feed_forward.router",
578
- "language_model.model.layers.40.feed_forward.shared_expert.gate_proj",
579
- "language_model.model.layers.40.feed_forward.shared_expert.up_proj",
580
- "language_model.model.layers.40.feed_forward.shared_expert.down_proj",
581
- "language_model.model.layers.41.self_attn.q_proj",
582
- "language_model.model.layers.41.self_attn.k_proj",
583
- "language_model.model.layers.41.self_attn.v_proj",
584
- "language_model.model.layers.41.self_attn.o_proj",
585
- "language_model.model.layers.41.feed_forward.router",
586
- "language_model.model.layers.41.feed_forward.shared_expert.gate_proj",
587
- "language_model.model.layers.41.feed_forward.shared_expert.up_proj",
588
- "language_model.model.layers.41.feed_forward.shared_expert.down_proj",
589
- "language_model.model.layers.42.self_attn.q_proj",
590
- "language_model.model.layers.42.self_attn.k_proj",
591
- "language_model.model.layers.42.self_attn.v_proj",
592
- "language_model.model.layers.42.self_attn.o_proj",
593
- "language_model.model.layers.42.feed_forward.router",
594
- "language_model.model.layers.42.feed_forward.shared_expert.gate_proj",
595
- "language_model.model.layers.42.feed_forward.shared_expert.up_proj",
596
- "language_model.model.layers.42.feed_forward.shared_expert.down_proj",
597
- "language_model.model.layers.43.self_attn.q_proj",
598
- "language_model.model.layers.43.self_attn.k_proj",
599
- "language_model.model.layers.43.self_attn.v_proj",
600
- "language_model.model.layers.43.self_attn.o_proj",
601
- "language_model.model.layers.43.feed_forward.router",
602
- "language_model.model.layers.43.feed_forward.shared_expert.gate_proj",
603
- "language_model.model.layers.43.feed_forward.shared_expert.up_proj",
604
- "language_model.model.layers.43.feed_forward.shared_expert.down_proj",
605
- "language_model.model.layers.44.self_attn.q_proj",
606
- "language_model.model.layers.44.self_attn.k_proj",
607
- "language_model.model.layers.44.self_attn.v_proj",
608
- "language_model.model.layers.44.self_attn.o_proj",
609
- "language_model.model.layers.44.feed_forward.router",
610
- "language_model.model.layers.44.feed_forward.shared_expert.gate_proj",
611
- "language_model.model.layers.44.feed_forward.shared_expert.up_proj",
612
- "language_model.model.layers.44.feed_forward.shared_expert.down_proj",
613
- "language_model.model.layers.45.self_attn.q_proj",
614
- "language_model.model.layers.45.self_attn.k_proj",
615
- "language_model.model.layers.45.self_attn.v_proj",
616
- "language_model.model.layers.45.self_attn.o_proj",
617
- "language_model.model.layers.45.feed_forward.router",
618
- "language_model.model.layers.45.feed_forward.shared_expert.gate_proj",
619
- "language_model.model.layers.45.feed_forward.shared_expert.up_proj",
620
- "language_model.model.layers.45.feed_forward.shared_expert.down_proj",
621
- "language_model.model.layers.46.self_attn.q_proj",
622
- "language_model.model.layers.46.self_attn.k_proj",
623
- "language_model.model.layers.46.self_attn.v_proj",
624
- "language_model.model.layers.46.self_attn.o_proj",
625
- "language_model.model.layers.46.feed_forward.router",
626
- "language_model.model.layers.46.feed_forward.shared_expert.gate_proj",
627
- "language_model.model.layers.46.feed_forward.shared_expert.up_proj",
628
- "language_model.model.layers.46.feed_forward.shared_expert.down_proj",
629
- "language_model.model.layers.47.self_attn.q_proj",
630
- "language_model.model.layers.47.self_attn.k_proj",
631
- "language_model.model.layers.47.self_attn.v_proj",
632
- "language_model.model.layers.47.self_attn.o_proj",
633
- "language_model.model.layers.47.feed_forward.router",
634
- "language_model.model.layers.47.feed_forward.shared_expert.gate_proj",
635
- "language_model.model.layers.47.feed_forward.shared_expert.up_proj",
636
- "language_model.model.layers.47.feed_forward.shared_expert.down_proj",
637
- "language_model.lm_head"
638
- ],
639
- "kv_cache_scheme": null,
640
- "quant_method": "compressed-tensors",
641
- "quantization_status": "compressed"
642
- },
643
- "text_config": {
644
- "_attn_implementation_autoset": true,
645
- "attention_bias": false,
646
- "attention_chunk_size": 8192,
647
- "attention_dropout": 0.0,
648
- "attn_scale": 0.1,
649
- "attn_temperature_tuning": 4,
650
- "bos_token_id": 200000,
651
- "eos_token_id": [
652
- 200001,
653
- 200007,
654
- 200008
655
- ],
656
- "floor_scale": 8192,
657
- "for_llm_compressor": true,
658
- "head_dim": 128,
659
- "hidden_act": "silu",
660
- "hidden_size": 5120,
661
- "initializer_range": 0.02,
662
- "interleave_moe_layer_step": 1,
663
- "intermediate_size": 8192,
664
- "intermediate_size_mlp": 16384,
665
- "max_position_embeddings": 10485760,
666
- "model_type": "llama4_text",
667
- "moe_layers": [
668
- 0,
669
- 1,
670
- 2,
671
- 3,
672
- 4,
673
- 5,
674
- 6,
675
- 7,
676
- 8,
677
- 9,
678
- 10,
679
- 11,
680
- 12,
681
- 13,
682
- 14,
683
- 15,
684
- 16,
685
- 17,
686
- 18,
687
- 19,
688
- 20,
689
- 21,
690
- 22,
691
- 23,
692
- 24,
693
- 25,
694
- 26,
695
- 27,
696
- 28,
697
- 29,
698
- 30,
699
- 31,
700
- 32,
701
- 33,
702
- 34,
703
- 35,
704
- 36,
705
- 37,
706
- 38,
707
- 39,
708
- 40,
709
- 41,
710
- 42,
711
- 43,
712
- 44,
713
- 45,
714
- 46,
715
- 47
716
- ],
717
- "no_rope_layers": [
718
- 1,
719
- 1,
720
- 1,
721
- 0,
722
- 1,
723
- 1,
724
- 1,
725
- 0,
726
- 1,
727
- 1,
728
- 1,
729
- 0,
730
- 1,
731
- 1,
732
- 1,
733
- 0,
734
- 1,
735
- 1,
736
- 1,
737
- 0,
738
- 1,
739
- 1,
740
- 1,
741
- 0,
742
- 1,
743
- 1,
744
- 1,
745
- 0,
746
- 1,
747
- 1,
748
- 1,
749
- 0,
750
- 1,
751
- 1,
752
- 1,
753
- 0,
754
- 1,
755
- 1,
756
- 1,
757
- 0,
758
- 1,
759
- 1,
760
- 1,
761
- 0,
762
- 1,
763
- 1,
764
- 1,
765
- 0
766
- ],
767
- "num_attention_heads": 40,
768
- "num_experts_per_tok": 1,
769
- "num_hidden_layers": 48,
770
- "num_key_value_heads": 8,
771
- "num_local_experts": 16,
772
- "output_router_logits": false,
773
- "pad_token_id": 200018,
774
- "rms_norm_eps": 1e-05,
775
- "rope_scaling": {
776
- "factor": 16.0,
777
- "high_freq_factor": 1.0,
778
- "low_freq_factor": 1.0,
779
- "original_max_position_embeddings": 8192,
780
- "rope_type": "llama3"
781
- },
782
- "rope_theta": 500000.0,
783
- "router_aux_loss_coef": 0.001,
784
- "router_jitter_noise": 0.0,
785
- "torch_dtype": "bfloat16",
786
- "use_cache": true,
787
- "use_qk_norm": true,
788
- "vocab_size": 202048
789
- },
790
- "tie_word_embeddings": false,
791
- "torch_dtype": "bfloat16",
792
- "transformers_version": "4.51.0.dev0",
793
- "vision_config": {
794
- "_attn_implementation_autoset": true,
795
- "attention_dropout": 0.0,
796
- "hidden_act": "gelu",
797
- "hidden_size": 1408,
798
- "image_size": 336,
799
- "initializer_range": 0.02,
800
- "intermediate_size": 5632,
801
- "model_type": "llama4_vision_model",
802
- "multi_modal_projector_bias": false,
803
- "norm_eps": 1e-05,
804
- "num_attention_heads": 16,
805
- "num_channels": 3,
806
- "num_hidden_layers": 34,
807
- "patch_size": 14,
808
- "pixel_shuffle_ratio": 0.5,
809
- "projector_dropout": 0.0,
810
- "projector_input_dim": 4096,
811
- "projector_output_dim": 4096,
812
- "rope_theta": 10000,
813
- "vision_feature_layer": -1,
814
- "vision_feature_select_strategy": "default",
815
- "vision_output_dim": 4096
816
- }
817
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
model-00001-of-00025.safetensors → model-00001-of-00023.safetensors RENAMED
File without changes
model-00004-of-00025.safetensors → model-00002-of-00023.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f49cb8d3ddc44a715dae3da6db66b2d729fb628259311163694fc726e58008a5
3
- size 4993069176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45564a59bceae860c561c0df3a749fc9cd53f9544f0f3eb7e20c3ea3abaeca50
3
+ size 4993243544
model-00005-of-00025.safetensors → model-00003-of-00023.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:94ced100083cd715d2bac71163717f3e204d30cd36c0eb1aa74fb3d980205004
3
- size 4993069200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d3709c7ec359d32f8a2e5087abf949fa142aa277e44036af4532b654c412bee
3
+ size 4993249744
model-00002-of-00025.safetensors → model-00004-of-00023.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:02fc570f74e96c1fd2ee828e4ae3d69dbdaa6554cc5768be4d0e5f3fa7b7ac0a
3
- size 4993069104
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfbd86bd62812b58da4002f26e43d8ee6f6842a30c84cdc6d209dc08a47c80a9
3
+ size 4993385008
model-00003-of-00025.safetensors → model-00005-of-00023.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6fac21aa1fa6d3b65273078a19903d093998c0db46cfdca619fab70758fa5df9
3
- size 4993075288
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a2bdbd7070ebe089dd41dcd30d0974c846923fb28b864a34625828c8bb4720e
3
+ size 4993243448
model-00006-of-00023.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:086ff4c6faf089ed2dcc7e0526d0a88cec285ffbaa7022058f3f3e2028b5e00c
3
+ size 4993249968
model-00006-of-00025.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:90bcea4c3fb40c6f3a6975e3380a497368738ba5df686414809ebdfe431141fc
3
- size 4993205960
 
 
 
 
model-00007-of-00023.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c714823d9aa502d897584058d4591782582f98d97fe11533994f2ebce6c39c2
3
+ size 4993243800
model-00007-of-00025.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:30fa834e2583a5bba4ab7452c859ff9bb5da6ef224b4dd58273f0a445b178408
3
- size 4951076352
 
 
 
 
model-00008-of-00023.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39da01bc2ff9b736889b0d1bb4315ca841823e0bbd1a31fb0049e58e51ac9f8b
3
+ size 4993407816
model-00008-of-00025.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4e7acbfde1a6a3fd546c05bee4262ac88b30a1f6e0242fb545926c7b1f705465
3
- size 4993031936
 
 
 
 
model-00009-of-00023.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40f590c5cbb908cc5c8d317594859894db13bcac090a7fd46b6d2818197816cc
3
+ size 4993227400
model-00009-of-00025.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:fe1f0f050e4c519d47d1d5ec852275c4b0e7bb2799bb3195c5e9e097a62720d7
3
- size 4993069192
 
 
 
 
model-00010-of-00023.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97f05df75e2df713003387ede5cc74881be14267e3b7ac14eec84353a25da127
3
+ size 4993243696
model-00010-of-00025.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d6836ce9eb1d9b5b15a98e99937d5d924eff206ad290cbde08c8f6de37a41c38
3
- size 4993069216
 
 
 
 
model-00011-of-00023.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1d7b546425089d7f141df49c480b4ad85c925993e9507f351d470bdf731aadd
3
+ size 4993243736
model-00011-of-00025.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8b6546b902d40930ccbee985371e412ff9e93421bb699d7ad7a2cf4da45f1c61
3
- size 4993075392
 
 
 
 
model-00012-of-00023.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48e491ca3534c4e178a6fd7c8f2560b01432a0915cae017d735b51aa01d131f6
3
+ size 4993249944
model-00012-of-00025.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3dbe728b83b6f0abda059177b9ef469eaaeef59355fb78c2efc50a94fbd998de
3
- size 4993069280
 
 
 
 
model-00013-of-00023.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01fb18922b16aa486f4b86dc435125eb4b375c189186f07179fdd6a502cb237f
3
+ size 4993407832
model-00013-of-00025.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c661d5d69debe21c8d31e549a203bebaeebfd4b694b114d38a571d00fec6ec0
3
- size 4993069304
 
 
 
 
model-00014-of-00023.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9aa009c000001a690cb7e4106639c19862a5b2deea42f6ae4b7121decbfaa22
3
+ size 4993221256
model-00014-of-00025.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:fa24e1409f811ca22955375b79166a537b6fc5b53f1a6c418e6405be6e89bc08
3
- size 4993075480
 
 
 
 
model-00015-of-00023.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d67b71b3b97463e83aadf441ffa9b6acb53408420c42c3e566dd023404174593
3
+ size 4993249848
model-00015-of-00025.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:377f9bed46d56b0e82156b7f35d2ffd2cf40621c929d62c0168f943c1a30ff42
3
- size 4993069376
 
 
 
 
model-00016-of-00023.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4d43bdb18deaaf45a819c87f680e69c6b792449982efb8c300d1599b62cbe04
3
+ size 4993243744
model-00016-of-00025.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed39b016dc2b3f68d1b0c60a60cb4ca44b58fbcf39cf70c54696fd1d2936493f
3
- size 4993069408
 
 
 
 
model-00017-of-00023.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b1470c606ee6cf40a30b6b9639b5f03cb3f005542d4a120a9edc7bca3bf3d8f
3
+ size 4993243808
model-00017-of-00025.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:cfa91c9d8734d8c524b755d51abfc04dec652efcfd868e1905a28289ca31c76e
3
- size 4993075584
 
 
 
 
model-00018-of-00023.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a7c1418d67d020178d1eec2b95a94f1f18e8663186a4e0273a62cc9720fe637
3
+ size 4993413984
model-00018-of-00025.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e514d79585ee2d25edb3a8b7aeebddcdf4d3d964fd36cbb9780c3cf7ebffd573
3
- size 4993173160
 
 
 
 
model-00019-of-00023.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2640c96edc392c6d34f7bba8a7c02dba377b31d1c4719abfed3126ae76c458d
3
+ size 4993221256
model-00019-of-00025.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:397722313f499afbc6da44a0110c5f3b067d3e8aeb6807f1aba051bbfa81452e
3
- size 4993019672
 
 
 
 
model-00020-of-00023.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e527ea23f0451c33ace375998d9923b8596f91db1298be0d5b9fbd61286764cb
3
+ size 4993243704
model-00020-of-00025.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4406a44e8804aa9c8ea21b6b96369ca772d373bb04d078b714b7306676852272
3
- size 4993075320
 
 
 
 
model-00021-of-00023.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7a9fa2abdef027a6c0b606834d006715aaec059de76eabf78ea382117d312eb
3
+ size 4993249896
model-00021-of-00025.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8444521428980f70c2d46afe868a946e5bb2c7404302d31b725c3d9cf9cb8deb
3
- size 4993069208
 
 
 
 
model-00022-of-00023.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b5275c8540db1f40baf698cf480c844f534fbdc918717ddc654f49c3fe27f45
3
+ size 4993243816
model-00022-of-00025.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:11b10547da072ddfc27d43f73119e557479f4e829440c3b697b6121c069f99b8
3
- size 4993069232
 
 
 
 
model-00023-of-00023.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77ac77a23183abb72299176e5782765908f48c37c480b3256686046ce365e64a
3
+ size 4796555224
model-00023-of-00025.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:076ae740ff6265b501bd556aad41f666e005d6d3d1d6d9e32995e8c777a4859b
3
- size 4993075408
 
 
 
 
model-00024-of-00025.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:19964b0364c221a1cc8fba57d2bdc20382af117e25225887b0eab416705877e3
3
- size 3818306024
 
 
 
 
model-00025-of-00025.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:5f412ae287747cc86e6ee5d58600bbe5c574d6fa77f351367aae9ea3f4dbcc3a
3
- size 2068971664
 
 
 
 
model.safetensors.index.json CHANGED
The diff for this file is too large to render. See raw diff
 
recipe.yaml CHANGED
@@ -8,7 +8,5 @@ default_stage:
8
  input_activations: {num_bits: 8, type: float, symmetric: true, strategy: token,
9
  dynamic: true, observer: null}
10
  output_activations: null
11
- ignore: ['re:.*lm_head', 're:.*self_attn', 're:.*router', 're:.*vision_model', 're:.*multi_modal_projector',
12
- 're:.*shared_expert', 're:.*feed_forward.gate_proj', 're:.*feed_forward.up_proj',
13
- 're:.*feed_forward.down_proj']
14
  targets: [Linear]
 
8
  input_activations: {num_bits: 8, type: float, symmetric: true, strategy: token,
9
  dynamic: true, observer: null}
10
  output_activations: null
11
+ ignore: ['re:.*lm_head', 're:.*self_attn', 're:.*router', 're:.*vision_model', 're:.*multi_modal_projector']
 
 
12
  targets: [Linear]