MagicXin committed
Commit 2620266 · verified · 1 Parent(s): 8183abe

Upload folder using huggingface_hub

config.json CHANGED
@@ -1,5 +1,5 @@
  {
- "_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+ "_name_or_path": "/home/yu.xin/weishao/Med3DVLM/models/Med3DVLM-Qwen-2.5-7B",
  "architectures": [
  "VLMQwenForCausalLM"
  ],
@@ -57,7 +57,7 @@
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
- "torch_dtype": "bfloat16",
+ "torch_dtype": "float32",
  "transformers_version": "4.48.3",
  "use_cache": true,
  "use_sliding_window": false,
model-00006-of-00007.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:0d577d20479436de45481d01c6f33e45b8cda3fb9188acc3ec0d3a4960a84e44
+ oid sha256:af0b8f367ee0be887dc1ccd8b5c04d67935fd7eaafab5ebb8e3b3f73ddbca0af
  size 3924909896
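
Only the Git LFS pointer for this shard changes: a new sha256 oid with the same byte size. A small sketch for checking a locally downloaded copy of the shard against those pointer values (the local path is an assumption):

import hashlib
from pathlib import Path

shard = Path("model-00006-of-00007.safetensors")  # local copy of the shard

expected_oid = "af0b8f367ee0be887dc1ccd8b5c04d67935fd7eaafab5ebb8e3b3f73ddbca0af"
expected_size = 3924909896

# Hash the file in 1 MiB chunks so the 3.9 GB shard never sits in memory at once.
digest = hashlib.sha256()
with shard.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        digest.update(chunk)

assert shard.stat().st_size == expected_size, "size mismatch with LFS pointer"
assert digest.hexdigest() == expected_oid, "sha256 mismatch with LFS pointer"
print("shard matches the LFS pointer")
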
model.safetensors.index.json CHANGED
@@ -495,12 +495,12 @@
  "model.vision_tower.vision_tower.encoder.s1.0.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s1.0.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s1.0.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
- "model.vision_tower.vision_tower.encoder.s1.0.gamma": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s1.0.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s1.0.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s1.0.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s1.0.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s1.0.proj.weight": "model-00006-of-00007.safetensors",
+ "model.vision_tower.vision_tower.encoder.s1.0.scale": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s1.1.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s1.1.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s1.1.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
@@ -522,11 +522,11 @@
  "model.vision_tower.vision_tower.encoder.s1.1.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s1.1.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s1.1.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
- "model.vision_tower.vision_tower.encoder.s1.1.gamma": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s1.1.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s1.1.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s1.1.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s1.1.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
+ "model.vision_tower.vision_tower.encoder.s1.1.scale": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s2.0.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s2.0.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s2.0.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
@@ -548,12 +548,12 @@
  "model.vision_tower.vision_tower.encoder.s2.0.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s2.0.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s2.0.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
- "model.vision_tower.vision_tower.encoder.s2.0.gamma": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s2.0.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s2.0.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s2.0.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s2.0.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s2.0.proj.weight": "model-00006-of-00007.safetensors",
+ "model.vision_tower.vision_tower.encoder.s2.0.scale": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s2.1.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s2.1.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s2.1.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
@@ -575,11 +575,11 @@
  "model.vision_tower.vision_tower.encoder.s2.1.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s2.1.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s2.1.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
- "model.vision_tower.vision_tower.encoder.s2.1.gamma": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s2.1.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s2.1.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s2.1.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s2.1.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
+ "model.vision_tower.vision_tower.encoder.s2.1.scale": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s2.2.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s2.2.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s2.2.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
@@ -601,11 +601,11 @@
  "model.vision_tower.vision_tower.encoder.s2.2.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s2.2.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s2.2.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
- "model.vision_tower.vision_tower.encoder.s2.2.gamma": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s2.2.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s2.2.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s2.2.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s2.2.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
+ "model.vision_tower.vision_tower.encoder.s2.2.scale": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.0.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.0.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.0.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
@@ -627,12 +627,12 @@
  "model.vision_tower.vision_tower.encoder.s3.0.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.0.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.0.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
- "model.vision_tower.vision_tower.encoder.s3.0.gamma": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.0.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.0.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.0.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.0.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.0.proj.weight": "model-00006-of-00007.safetensors",
+ "model.vision_tower.vision_tower.encoder.s3.0.scale": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.1.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.1.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.1.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
@@ -654,11 +654,11 @@
  "model.vision_tower.vision_tower.encoder.s3.1.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.1.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.1.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
- "model.vision_tower.vision_tower.encoder.s3.1.gamma": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.1.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.1.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.1.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.1.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
+ "model.vision_tower.vision_tower.encoder.s3.1.scale": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.2.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.2.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.2.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
@@ -680,11 +680,11 @@
  "model.vision_tower.vision_tower.encoder.s3.2.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.2.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.2.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
- "model.vision_tower.vision_tower.encoder.s3.2.gamma": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.2.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.2.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.2.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.2.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
+ "model.vision_tower.vision_tower.encoder.s3.2.scale": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.3.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.3.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.3.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
@@ -706,11 +706,11 @@
  "model.vision_tower.vision_tower.encoder.s3.3.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.3.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.3.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
- "model.vision_tower.vision_tower.encoder.s3.3.gamma": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.3.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.3.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.3.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.3.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
+ "model.vision_tower.vision_tower.encoder.s3.3.scale": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.4.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.4.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.4.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
@@ -732,11 +732,11 @@
  "model.vision_tower.vision_tower.encoder.s3.4.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.4.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.4.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
- "model.vision_tower.vision_tower.encoder.s3.4.gamma": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.4.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.4.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.4.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.4.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
+ "model.vision_tower.vision_tower.encoder.s3.4.scale": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.5.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.5.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.5.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
@@ -758,11 +758,11 @@
  "model.vision_tower.vision_tower.encoder.s3.5.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.5.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.5.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
- "model.vision_tower.vision_tower.encoder.s3.5.gamma": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.5.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.5.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.5.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s3.5.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
+ "model.vision_tower.vision_tower.encoder.s3.5.scale": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s4.0.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s4.0.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s4.0.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
@@ -784,12 +784,12 @@
  "model.vision_tower.vision_tower.encoder.s4.0.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s4.0.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s4.0.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
- "model.vision_tower.vision_tower.encoder.s4.0.gamma": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s4.0.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s4.0.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s4.0.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s4.0.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s4.0.proj.weight": "model-00006-of-00007.safetensors",
+ "model.vision_tower.vision_tower.encoder.s4.0.scale": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s4.1.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s4.1.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s4.1.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
@@ -811,10 +811,10 @@
  "model.vision_tower.vision_tower.encoder.s4.1.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s4.1.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s4.1.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
- "model.vision_tower.vision_tower.encoder.s4.1.gamma": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s4.1.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s4.1.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
  "model.vision_tower.vision_tower.encoder.s4.1.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
- "model.vision_tower.vision_tower.encoder.s4.1.mlp.mlp.3.weight": "model-00006-of-00007.safetensors"
+ "model.vision_tower.vision_tower.encoder.s4.1.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
+ "model.vision_tower.vision_tower.encoder.s4.1.scale": "model-00006-of-00007.safetensors"
  }
  }
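
The weight-map edits above rename every per-block "*.gamma" entry in the vision tower encoder to "*.scale", keeping each tensor in the same shard. If an older state dict still uses the gamma names, a hypothetical remapping helper along these lines (not part of the repository) could translate the keys before loading:

import re

def remap_gamma_to_scale(state_dict):
    """Rename encoder ConvBlock 'gamma' keys to 'scale'; leave all other keys alone."""
    pattern = re.compile(
        r"^(model\.vision_tower\.vision_tower\.encoder\.s\d+\.\d+)\.gamma$"
    )
    remapped = {}
    for key, value in state_dict.items():
        match = pattern.match(key)
        new_key = f"{match.group(1)}.scale" if match else key
        remapped[new_key] = value
    return remapped

Usage would then be roughly model.load_state_dict(remap_gamma_to_scale(old_state_dict)).
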
modeling.py CHANGED
@@ -648,7 +648,7 @@ class ConvBlock(nn.Module):
         self.dwconv = DecompConv3D(oup, oup, kernel_size, groups=oup)
         self.mlp = MLP(oup, hidden_dim)

-        self.gamma = (
+        self.scale = (
             nn.Parameter(layer_scale_init_value * torch.ones((oup)), requires_grad=True)
             if layer_scale_init_value > 0
             else None
@@ -664,8 +664,8 @@

         x = self.mlp(x)

-        if self.gamma is not None:
-            x = self.gamma * x
+        if self.scale is not None:
+            x = self.scale * x
         x = x.permute(0, 4, 1, 2, 3)  # (N, H, W, C) -> (N, C, H, W)

         x = input + self.drop_path(x)
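
In the source, the rename touches ConvBlock's layer-scale parameter: a learnable per-channel multiplier applied to the MLP output before the residual addition. The stripped-down block below only illustrates that pattern; DecompConv3D, MLP, and the drop-path used by the real ConvBlock are omitted, and the class name here is made up.

import torch
import torch.nn as nn

class TinyLayerScaleBlock(nn.Module):
    def __init__(self, channels, layer_scale_init_value=1e-6):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(channels, 4 * channels),
            nn.GELU(),
            nn.Linear(4 * channels, channels),
        )
        # Per-channel multiplier, initialised near zero so the block starts close
        # to the identity mapping; disabled when the init value is <= 0.
        self.scale = (
            nn.Parameter(layer_scale_init_value * torch.ones(channels))
            if layer_scale_init_value > 0
            else None
        )

    def forward(self, x):  # x: (N, ..., C), channels-last
        residual = x
        x = self.mlp(x)
        if self.scale is not None:
            x = self.scale * x
        return residual + x

x = torch.randn(2, 8, 8, 8, 64)
print(TinyLayerScaleBlock(64)(x).shape)  # torch.Size([2, 8, 8, 8, 64])
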