Upload folder using huggingface_hub
Browse files
- config.json +2 -2
- model-00006-of-00007.safetensors +1 -1
- model.safetensors.index.json +14 -14
- modeling.py +3 -3
config.json
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
{
|
2 |
-
"_name_or_path": "
|
3 |
"architectures": [
|
4 |
"VLMQwenForCausalLM"
|
5 |
],
|
@@ -57,7 +57,7 @@
|
|
57 |
"rope_theta": 1000000.0,
|
58 |
"sliding_window": null,
|
59 |
"tie_word_embeddings": false,
|
60 |
-
"torch_dtype": "
|
61 |
"transformers_version": "4.48.3",
|
62 |
"use_cache": true,
|
63 |
"use_sliding_window": false,
|
|
|
1 |
{
|
2 |
+
"_name_or_path": "/home/yu.xin/weishao/Med3DVLM/models/Med3DVLM-Qwen-2.5-7B",
|
3 |
"architectures": [
|
4 |
"VLMQwenForCausalLM"
|
5 |
],
|
|
|
57 |
"rope_theta": 1000000.0,
|
58 |
"sliding_window": null,
|
59 |
"tie_word_embeddings": false,
|
60 |
+
"torch_dtype": "float32",
|
61 |
"transformers_version": "4.48.3",
|
62 |
"use_cache": true,
|
63 |
"use_sliding_window": false,
|
model-00006-of-00007.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 3924909896
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:af0b8f367ee0be887dc1ccd8b5c04d67935fd7eaafab5ebb8e3b3f73ddbca0af
|
3 |
size 3924909896
|
model.safetensors.index.json
CHANGED
@@ -495,12 +495,12 @@
|
|
495 |
"model.vision_tower.vision_tower.encoder.s1.0.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
|
496 |
"model.vision_tower.vision_tower.encoder.s1.0.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
|
497 |
"model.vision_tower.vision_tower.encoder.s1.0.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
|
498 |
-
"model.vision_tower.vision_tower.encoder.s1.0.gamma": "model-00006-of-00007.safetensors",
|
499 |
"model.vision_tower.vision_tower.encoder.s1.0.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
|
500 |
"model.vision_tower.vision_tower.encoder.s1.0.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
|
501 |
"model.vision_tower.vision_tower.encoder.s1.0.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
|
502 |
"model.vision_tower.vision_tower.encoder.s1.0.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
|
503 |
"model.vision_tower.vision_tower.encoder.s1.0.proj.weight": "model-00006-of-00007.safetensors",
|
|
|
504 |
"model.vision_tower.vision_tower.encoder.s1.1.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
|
505 |
"model.vision_tower.vision_tower.encoder.s1.1.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
|
506 |
"model.vision_tower.vision_tower.encoder.s1.1.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
|
@@ -522,11 +522,11 @@
|
|
522 |
"model.vision_tower.vision_tower.encoder.s1.1.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
|
523 |
"model.vision_tower.vision_tower.encoder.s1.1.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
|
524 |
"model.vision_tower.vision_tower.encoder.s1.1.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
|
525 |
-
"model.vision_tower.vision_tower.encoder.s1.1.gamma": "model-00006-of-00007.safetensors",
|
526 |
"model.vision_tower.vision_tower.encoder.s1.1.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
|
527 |
"model.vision_tower.vision_tower.encoder.s1.1.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
|
528 |
"model.vision_tower.vision_tower.encoder.s1.1.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
|
529 |
"model.vision_tower.vision_tower.encoder.s1.1.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
|
|
|
530 |
"model.vision_tower.vision_tower.encoder.s2.0.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
|
531 |
"model.vision_tower.vision_tower.encoder.s2.0.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
|
532 |
"model.vision_tower.vision_tower.encoder.s2.0.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
|
@@ -548,12 +548,12 @@
|
|
548 |
"model.vision_tower.vision_tower.encoder.s2.0.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
|
549 |
"model.vision_tower.vision_tower.encoder.s2.0.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
|
550 |
"model.vision_tower.vision_tower.encoder.s2.0.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
|
551 |
-
"model.vision_tower.vision_tower.encoder.s2.0.gamma": "model-00006-of-00007.safetensors",
|
552 |
"model.vision_tower.vision_tower.encoder.s2.0.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
|
553 |
"model.vision_tower.vision_tower.encoder.s2.0.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
|
554 |
"model.vision_tower.vision_tower.encoder.s2.0.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
|
555 |
"model.vision_tower.vision_tower.encoder.s2.0.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
|
556 |
"model.vision_tower.vision_tower.encoder.s2.0.proj.weight": "model-00006-of-00007.safetensors",
|
|
|
557 |
"model.vision_tower.vision_tower.encoder.s2.1.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
|
558 |
"model.vision_tower.vision_tower.encoder.s2.1.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
|
559 |
"model.vision_tower.vision_tower.encoder.s2.1.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
|
@@ -575,11 +575,11 @@
|
|
575 |
"model.vision_tower.vision_tower.encoder.s2.1.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
|
576 |
"model.vision_tower.vision_tower.encoder.s2.1.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
|
577 |
"model.vision_tower.vision_tower.encoder.s2.1.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
|
578 |
-
"model.vision_tower.vision_tower.encoder.s2.1.gamma": "model-00006-of-00007.safetensors",
|
579 |
"model.vision_tower.vision_tower.encoder.s2.1.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
|
580 |
"model.vision_tower.vision_tower.encoder.s2.1.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
|
581 |
"model.vision_tower.vision_tower.encoder.s2.1.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
|
582 |
"model.vision_tower.vision_tower.encoder.s2.1.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
|
|
|
583 |
"model.vision_tower.vision_tower.encoder.s2.2.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
|
584 |
"model.vision_tower.vision_tower.encoder.s2.2.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
|
585 |
"model.vision_tower.vision_tower.encoder.s2.2.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
|
@@ -601,11 +601,11 @@
|
|
601 |
"model.vision_tower.vision_tower.encoder.s2.2.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
|
602 |
"model.vision_tower.vision_tower.encoder.s2.2.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
|
603 |
"model.vision_tower.vision_tower.encoder.s2.2.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
|
604 |
-
"model.vision_tower.vision_tower.encoder.s2.2.gamma": "model-00006-of-00007.safetensors",
|
605 |
"model.vision_tower.vision_tower.encoder.s2.2.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
|
606 |
"model.vision_tower.vision_tower.encoder.s2.2.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
|
607 |
"model.vision_tower.vision_tower.encoder.s2.2.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
|
608 |
"model.vision_tower.vision_tower.encoder.s2.2.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
|
|
|
609 |
"model.vision_tower.vision_tower.encoder.s3.0.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
|
610 |
"model.vision_tower.vision_tower.encoder.s3.0.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
|
611 |
"model.vision_tower.vision_tower.encoder.s3.0.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
|
@@ -627,12 +627,12 @@
|
|
627 |
"model.vision_tower.vision_tower.encoder.s3.0.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
|
628 |
"model.vision_tower.vision_tower.encoder.s3.0.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
|
629 |
"model.vision_tower.vision_tower.encoder.s3.0.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
|
630 |
-
"model.vision_tower.vision_tower.encoder.s3.0.gamma": "model-00006-of-00007.safetensors",
|
631 |
"model.vision_tower.vision_tower.encoder.s3.0.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
|
632 |
"model.vision_tower.vision_tower.encoder.s3.0.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
|
633 |
"model.vision_tower.vision_tower.encoder.s3.0.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
|
634 |
"model.vision_tower.vision_tower.encoder.s3.0.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
|
635 |
"model.vision_tower.vision_tower.encoder.s3.0.proj.weight": "model-00006-of-00007.safetensors",
|
|
|
636 |
"model.vision_tower.vision_tower.encoder.s3.1.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
|
637 |
"model.vision_tower.vision_tower.encoder.s3.1.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
|
638 |
"model.vision_tower.vision_tower.encoder.s3.1.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
|
@@ -654,11 +654,11 @@
|
|
654 |
"model.vision_tower.vision_tower.encoder.s3.1.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
|
655 |
"model.vision_tower.vision_tower.encoder.s3.1.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
|
656 |
"model.vision_tower.vision_tower.encoder.s3.1.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
|
657 |
-
"model.vision_tower.vision_tower.encoder.s3.1.gamma": "model-00006-of-00007.safetensors",
|
658 |
"model.vision_tower.vision_tower.encoder.s3.1.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
|
659 |
"model.vision_tower.vision_tower.encoder.s3.1.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
|
660 |
"model.vision_tower.vision_tower.encoder.s3.1.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
|
661 |
"model.vision_tower.vision_tower.encoder.s3.1.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
|
|
|
662 |
"model.vision_tower.vision_tower.encoder.s3.2.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
|
663 |
"model.vision_tower.vision_tower.encoder.s3.2.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
|
664 |
"model.vision_tower.vision_tower.encoder.s3.2.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
|
@@ -680,11 +680,11 @@
|
|
680 |
"model.vision_tower.vision_tower.encoder.s3.2.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
|
681 |
"model.vision_tower.vision_tower.encoder.s3.2.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
|
682 |
"model.vision_tower.vision_tower.encoder.s3.2.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
|
683 |
-
"model.vision_tower.vision_tower.encoder.s3.2.gamma": "model-00006-of-00007.safetensors",
|
684 |
"model.vision_tower.vision_tower.encoder.s3.2.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
|
685 |
"model.vision_tower.vision_tower.encoder.s3.2.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
|
686 |
"model.vision_tower.vision_tower.encoder.s3.2.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
|
687 |
"model.vision_tower.vision_tower.encoder.s3.2.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
|
|
|
688 |
"model.vision_tower.vision_tower.encoder.s3.3.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
|
689 |
"model.vision_tower.vision_tower.encoder.s3.3.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
|
690 |
"model.vision_tower.vision_tower.encoder.s3.3.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
|
@@ -706,11 +706,11 @@
|
|
706 |
"model.vision_tower.vision_tower.encoder.s3.3.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
|
707 |
"model.vision_tower.vision_tower.encoder.s3.3.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
|
708 |
"model.vision_tower.vision_tower.encoder.s3.3.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
|
709 |
-
"model.vision_tower.vision_tower.encoder.s3.3.gamma": "model-00006-of-00007.safetensors",
|
710 |
"model.vision_tower.vision_tower.encoder.s3.3.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
|
711 |
"model.vision_tower.vision_tower.encoder.s3.3.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
|
712 |
"model.vision_tower.vision_tower.encoder.s3.3.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
|
713 |
"model.vision_tower.vision_tower.encoder.s3.3.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
|
|
|
714 |
"model.vision_tower.vision_tower.encoder.s3.4.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
|
715 |
"model.vision_tower.vision_tower.encoder.s3.4.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
|
716 |
"model.vision_tower.vision_tower.encoder.s3.4.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
|
@@ -732,11 +732,11 @@
|
|
732 |
"model.vision_tower.vision_tower.encoder.s3.4.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
|
733 |
"model.vision_tower.vision_tower.encoder.s3.4.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
|
734 |
"model.vision_tower.vision_tower.encoder.s3.4.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
|
735 |
-
"model.vision_tower.vision_tower.encoder.s3.4.gamma": "model-00006-of-00007.safetensors",
|
736 |
"model.vision_tower.vision_tower.encoder.s3.4.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
|
737 |
"model.vision_tower.vision_tower.encoder.s3.4.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
|
738 |
"model.vision_tower.vision_tower.encoder.s3.4.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
|
739 |
"model.vision_tower.vision_tower.encoder.s3.4.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
|
|
|
740 |
"model.vision_tower.vision_tower.encoder.s3.5.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
|
741 |
"model.vision_tower.vision_tower.encoder.s3.5.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
|
742 |
"model.vision_tower.vision_tower.encoder.s3.5.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
|
@@ -758,11 +758,11 @@
|
|
758 |
"model.vision_tower.vision_tower.encoder.s3.5.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
|
759 |
"model.vision_tower.vision_tower.encoder.s3.5.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
|
760 |
"model.vision_tower.vision_tower.encoder.s3.5.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
|
761 |
-
"model.vision_tower.vision_tower.encoder.s3.5.gamma": "model-00006-of-00007.safetensors",
|
762 |
"model.vision_tower.vision_tower.encoder.s3.5.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
|
763 |
"model.vision_tower.vision_tower.encoder.s3.5.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
|
764 |
"model.vision_tower.vision_tower.encoder.s3.5.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
|
765 |
"model.vision_tower.vision_tower.encoder.s3.5.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
|
|
|
766 |
"model.vision_tower.vision_tower.encoder.s4.0.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
|
767 |
"model.vision_tower.vision_tower.encoder.s4.0.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
|
768 |
"model.vision_tower.vision_tower.encoder.s4.0.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
|
@@ -784,12 +784,12 @@
|
|
784 |
"model.vision_tower.vision_tower.encoder.s4.0.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
|
785 |
"model.vision_tower.vision_tower.encoder.s4.0.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
|
786 |
"model.vision_tower.vision_tower.encoder.s4.0.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
|
787 |
-
"model.vision_tower.vision_tower.encoder.s4.0.gamma": "model-00006-of-00007.safetensors",
|
788 |
"model.vision_tower.vision_tower.encoder.s4.0.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
|
789 |
"model.vision_tower.vision_tower.encoder.s4.0.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
|
790 |
"model.vision_tower.vision_tower.encoder.s4.0.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
|
791 |
"model.vision_tower.vision_tower.encoder.s4.0.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
|
792 |
"model.vision_tower.vision_tower.encoder.s4.0.proj.weight": "model-00006-of-00007.safetensors",
|
|
|
793 |
"model.vision_tower.vision_tower.encoder.s4.1.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
|
794 |
"model.vision_tower.vision_tower.encoder.s4.1.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
|
795 |
"model.vision_tower.vision_tower.encoder.s4.1.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
|
@@ -811,10 +811,10 @@
|
|
811 |
"model.vision_tower.vision_tower.encoder.s4.1.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
|
812 |
"model.vision_tower.vision_tower.encoder.s4.1.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
|
813 |
"model.vision_tower.vision_tower.encoder.s4.1.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
|
814 |
-
"model.vision_tower.vision_tower.encoder.s4.1.gamma": "model-00006-of-00007.safetensors",
|
815 |
"model.vision_tower.vision_tower.encoder.s4.1.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
|
816 |
"model.vision_tower.vision_tower.encoder.s4.1.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
|
817 |
"model.vision_tower.vision_tower.encoder.s4.1.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
|
818 |
-
"model.vision_tower.vision_tower.encoder.s4.1.mlp.mlp.3.weight": "model-00006-of-00007.safetensors"
|
|
|
819 |
}
|
820 |
}
|
|
|
495 |
"model.vision_tower.vision_tower.encoder.s1.0.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
|
496 |
"model.vision_tower.vision_tower.encoder.s1.0.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
|
497 |
"model.vision_tower.vision_tower.encoder.s1.0.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
|
|
|
498 |
"model.vision_tower.vision_tower.encoder.s1.0.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
|
499 |
"model.vision_tower.vision_tower.encoder.s1.0.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
|
500 |
"model.vision_tower.vision_tower.encoder.s1.0.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
|
501 |
"model.vision_tower.vision_tower.encoder.s1.0.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
|
502 |
"model.vision_tower.vision_tower.encoder.s1.0.proj.weight": "model-00006-of-00007.safetensors",
|
503 |
+
"model.vision_tower.vision_tower.encoder.s1.0.scale": "model-00006-of-00007.safetensors",
|
504 |
"model.vision_tower.vision_tower.encoder.s1.1.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
|
505 |
"model.vision_tower.vision_tower.encoder.s1.1.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
|
506 |
"model.vision_tower.vision_tower.encoder.s1.1.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
|
|
|
522 |
"model.vision_tower.vision_tower.encoder.s1.1.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
|
523 |
"model.vision_tower.vision_tower.encoder.s1.1.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
|
524 |
"model.vision_tower.vision_tower.encoder.s1.1.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
|
|
|
525 |
"model.vision_tower.vision_tower.encoder.s1.1.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
|
526 |
"model.vision_tower.vision_tower.encoder.s1.1.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
|
527 |
"model.vision_tower.vision_tower.encoder.s1.1.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
|
528 |
"model.vision_tower.vision_tower.encoder.s1.1.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
|
529 |
+
"model.vision_tower.vision_tower.encoder.s1.1.scale": "model-00006-of-00007.safetensors",
|
530 |
"model.vision_tower.vision_tower.encoder.s2.0.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
|
531 |
"model.vision_tower.vision_tower.encoder.s2.0.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
|
532 |
"model.vision_tower.vision_tower.encoder.s2.0.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
|
|
|
548 |
"model.vision_tower.vision_tower.encoder.s2.0.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
|
549 |
"model.vision_tower.vision_tower.encoder.s2.0.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
|
550 |
"model.vision_tower.vision_tower.encoder.s2.0.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
|
|
|
551 |
"model.vision_tower.vision_tower.encoder.s2.0.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
|
552 |
"model.vision_tower.vision_tower.encoder.s2.0.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
|
553 |
"model.vision_tower.vision_tower.encoder.s2.0.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
|
554 |
"model.vision_tower.vision_tower.encoder.s2.0.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
|
555 |
"model.vision_tower.vision_tower.encoder.s2.0.proj.weight": "model-00006-of-00007.safetensors",
|
556 |
+
"model.vision_tower.vision_tower.encoder.s2.0.scale": "model-00006-of-00007.safetensors",
|
557 |
"model.vision_tower.vision_tower.encoder.s2.1.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
|
558 |
"model.vision_tower.vision_tower.encoder.s2.1.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
|
559 |
"model.vision_tower.vision_tower.encoder.s2.1.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
|
|
|
575 |
"model.vision_tower.vision_tower.encoder.s2.1.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
|
576 |
"model.vision_tower.vision_tower.encoder.s2.1.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
|
577 |
"model.vision_tower.vision_tower.encoder.s2.1.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
|
|
|
578 |
"model.vision_tower.vision_tower.encoder.s2.1.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
|
579 |
"model.vision_tower.vision_tower.encoder.s2.1.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
|
580 |
"model.vision_tower.vision_tower.encoder.s2.1.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
|
581 |
"model.vision_tower.vision_tower.encoder.s2.1.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
|
582 |
+
"model.vision_tower.vision_tower.encoder.s2.1.scale": "model-00006-of-00007.safetensors",
|
583 |
"model.vision_tower.vision_tower.encoder.s2.2.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
|
584 |
"model.vision_tower.vision_tower.encoder.s2.2.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
|
585 |
"model.vision_tower.vision_tower.encoder.s2.2.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
|
|
|
601 |
"model.vision_tower.vision_tower.encoder.s2.2.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
|
602 |
"model.vision_tower.vision_tower.encoder.s2.2.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
|
603 |
"model.vision_tower.vision_tower.encoder.s2.2.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
|
|
|
604 |
"model.vision_tower.vision_tower.encoder.s2.2.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
|
605 |
"model.vision_tower.vision_tower.encoder.s2.2.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
|
606 |
"model.vision_tower.vision_tower.encoder.s2.2.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
|
607 |
"model.vision_tower.vision_tower.encoder.s2.2.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
|
608 |
+
"model.vision_tower.vision_tower.encoder.s2.2.scale": "model-00006-of-00007.safetensors",
|
609 |
"model.vision_tower.vision_tower.encoder.s3.0.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
|
610 |
"model.vision_tower.vision_tower.encoder.s3.0.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
|
611 |
"model.vision_tower.vision_tower.encoder.s3.0.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
|
|
|
627 |
"model.vision_tower.vision_tower.encoder.s3.0.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
|
628 |
"model.vision_tower.vision_tower.encoder.s3.0.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
|
629 |
"model.vision_tower.vision_tower.encoder.s3.0.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
|
|
|
630 |
"model.vision_tower.vision_tower.encoder.s3.0.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
|
631 |
"model.vision_tower.vision_tower.encoder.s3.0.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
|
632 |
"model.vision_tower.vision_tower.encoder.s3.0.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
|
633 |
"model.vision_tower.vision_tower.encoder.s3.0.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
|
634 |
"model.vision_tower.vision_tower.encoder.s3.0.proj.weight": "model-00006-of-00007.safetensors",
|
635 |
+
"model.vision_tower.vision_tower.encoder.s3.0.scale": "model-00006-of-00007.safetensors",
|
636 |
"model.vision_tower.vision_tower.encoder.s3.1.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
|
637 |
"model.vision_tower.vision_tower.encoder.s3.1.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
|
638 |
"model.vision_tower.vision_tower.encoder.s3.1.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
|
|
|
654 |
"model.vision_tower.vision_tower.encoder.s3.1.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
|
655 |
"model.vision_tower.vision_tower.encoder.s3.1.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
|
656 |
"model.vision_tower.vision_tower.encoder.s3.1.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
|
|
|
657 |
"model.vision_tower.vision_tower.encoder.s3.1.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
|
658 |
"model.vision_tower.vision_tower.encoder.s3.1.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
|
659 |
"model.vision_tower.vision_tower.encoder.s3.1.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
|
660 |
"model.vision_tower.vision_tower.encoder.s3.1.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
|
661 |
+
"model.vision_tower.vision_tower.encoder.s3.1.scale": "model-00006-of-00007.safetensors",
|
662 |
"model.vision_tower.vision_tower.encoder.s3.2.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
|
663 |
"model.vision_tower.vision_tower.encoder.s3.2.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
|
664 |
"model.vision_tower.vision_tower.encoder.s3.2.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
|
|
|
680 |
"model.vision_tower.vision_tower.encoder.s3.2.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
|
681 |
"model.vision_tower.vision_tower.encoder.s3.2.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
|
682 |
"model.vision_tower.vision_tower.encoder.s3.2.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
|
|
|
683 |
"model.vision_tower.vision_tower.encoder.s3.2.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
|
684 |
"model.vision_tower.vision_tower.encoder.s3.2.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
|
685 |
"model.vision_tower.vision_tower.encoder.s3.2.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
|
686 |
"model.vision_tower.vision_tower.encoder.s3.2.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
|
687 |
+
"model.vision_tower.vision_tower.encoder.s3.2.scale": "model-00006-of-00007.safetensors",
|
688 |
"model.vision_tower.vision_tower.encoder.s3.3.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
|
689 |
"model.vision_tower.vision_tower.encoder.s3.3.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
|
690 |
"model.vision_tower.vision_tower.encoder.s3.3.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
|
|
|
706 |
"model.vision_tower.vision_tower.encoder.s3.3.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
|
707 |
"model.vision_tower.vision_tower.encoder.s3.3.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
|
708 |
"model.vision_tower.vision_tower.encoder.s3.3.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
|
|
|
709 |
"model.vision_tower.vision_tower.encoder.s3.3.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
|
710 |
"model.vision_tower.vision_tower.encoder.s3.3.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
|
711 |
"model.vision_tower.vision_tower.encoder.s3.3.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
|
712 |
"model.vision_tower.vision_tower.encoder.s3.3.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
|
713 |
+
"model.vision_tower.vision_tower.encoder.s3.3.scale": "model-00006-of-00007.safetensors",
|
714 |
"model.vision_tower.vision_tower.encoder.s3.4.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
|
715 |
"model.vision_tower.vision_tower.encoder.s3.4.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
|
716 |
"model.vision_tower.vision_tower.encoder.s3.4.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
|
|
|
732 |
"model.vision_tower.vision_tower.encoder.s3.4.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
|
733 |
"model.vision_tower.vision_tower.encoder.s3.4.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
|
734 |
"model.vision_tower.vision_tower.encoder.s3.4.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
|
|
|
735 |
"model.vision_tower.vision_tower.encoder.s3.4.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
|
736 |
"model.vision_tower.vision_tower.encoder.s3.4.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
|
737 |
"model.vision_tower.vision_tower.encoder.s3.4.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
|
738 |
"model.vision_tower.vision_tower.encoder.s3.4.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
|
739 |
+
"model.vision_tower.vision_tower.encoder.s3.4.scale": "model-00006-of-00007.safetensors",
|
740 |
"model.vision_tower.vision_tower.encoder.s3.5.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
|
741 |
"model.vision_tower.vision_tower.encoder.s3.5.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
|
742 |
"model.vision_tower.vision_tower.encoder.s3.5.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
|
|
|
758 |
"model.vision_tower.vision_tower.encoder.s3.5.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
|
759 |
"model.vision_tower.vision_tower.encoder.s3.5.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
|
760 |
"model.vision_tower.vision_tower.encoder.s3.5.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
|
|
|
761 |
"model.vision_tower.vision_tower.encoder.s3.5.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
|
762 |
"model.vision_tower.vision_tower.encoder.s3.5.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
|
763 |
"model.vision_tower.vision_tower.encoder.s3.5.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
|
764 |
"model.vision_tower.vision_tower.encoder.s3.5.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
|
765 |
+
"model.vision_tower.vision_tower.encoder.s3.5.scale": "model-00006-of-00007.safetensors",
|
766 |
"model.vision_tower.vision_tower.encoder.s4.0.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
|
767 |
"model.vision_tower.vision_tower.encoder.s4.0.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
|
768 |
"model.vision_tower.vision_tower.encoder.s4.0.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
|
|
|
784 |
"model.vision_tower.vision_tower.encoder.s4.0.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
|
785 |
"model.vision_tower.vision_tower.encoder.s4.0.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
|
786 |
"model.vision_tower.vision_tower.encoder.s4.0.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
|
|
|
787 |
"model.vision_tower.vision_tower.encoder.s4.0.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
|
788 |
"model.vision_tower.vision_tower.encoder.s4.0.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
|
789 |
"model.vision_tower.vision_tower.encoder.s4.0.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
|
790 |
"model.vision_tower.vision_tower.encoder.s4.0.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
|
791 |
"model.vision_tower.vision_tower.encoder.s4.0.proj.weight": "model-00006-of-00007.safetensors",
|
792 |
+
"model.vision_tower.vision_tower.encoder.s4.0.scale": "model-00006-of-00007.safetensors",
|
793 |
"model.vision_tower.vision_tower.encoder.s4.1.dwconv.c1.0.bias": "model-00006-of-00007.safetensors",
|
794 |
"model.vision_tower.vision_tower.encoder.s4.1.dwconv.c1.0.weight": "model-00006-of-00007.safetensors",
|
795 |
"model.vision_tower.vision_tower.encoder.s4.1.dwconv.c1.1.bias": "model-00006-of-00007.safetensors",
|
|
|
811 |
"model.vision_tower.vision_tower.encoder.s4.1.dwconv.c3.1.running_mean": "model-00006-of-00007.safetensors",
|
812 |
"model.vision_tower.vision_tower.encoder.s4.1.dwconv.c3.1.running_var": "model-00006-of-00007.safetensors",
|
813 |
"model.vision_tower.vision_tower.encoder.s4.1.dwconv.c3.1.weight": "model-00006-of-00007.safetensors",
|
|
|
814 |
"model.vision_tower.vision_tower.encoder.s4.1.mlp.mlp.0.bias": "model-00006-of-00007.safetensors",
|
815 |
"model.vision_tower.vision_tower.encoder.s4.1.mlp.mlp.0.weight": "model-00006-of-00007.safetensors",
|
816 |
"model.vision_tower.vision_tower.encoder.s4.1.mlp.mlp.3.bias": "model-00006-of-00007.safetensors",
|
817 |
+
"model.vision_tower.vision_tower.encoder.s4.1.mlp.mlp.3.weight": "model-00006-of-00007.safetensors",
|
818 |
+
"model.vision_tower.vision_tower.encoder.s4.1.scale": "model-00006-of-00007.safetensors"
|
819 |
}
|
820 |
}
|
modeling.py
CHANGED
@@ -648,7 +648,7 @@ class ConvBlock(nn.Module):
|
|
648 |
self.dwconv = DecompConv3D(oup, oup, kernel_size, groups=oup)
|
649 |
self.mlp = MLP(oup, hidden_dim)
|
650 |
|
651 |
-
self.gamma = (
|
652 |
nn.Parameter(layer_scale_init_value * torch.ones((oup)), requires_grad=True)
|
653 |
if layer_scale_init_value > 0
|
654 |
else None
|
@@ -664,8 +664,8 @@ class ConvBlock(nn.Module):
|
|
664 |
|
665 |
x = self.mlp(x)
|
666 |
|
667 |
-
if self.gamma is not None:
|
668 |
-
x = self.gamma * x
|
669 |
x = x.permute(0, 4, 1, 2, 3) # (N, H, W, C) -> (N, C, H, W)
|
670 |
|
671 |
x = input + self.drop_path(x)
|
|
|
648 |
self.dwconv = DecompConv3D(oup, oup, kernel_size, groups=oup)
|
649 |
self.mlp = MLP(oup, hidden_dim)
|
650 |
|
651 |
+
self.scale = (
|
652 |
nn.Parameter(layer_scale_init_value * torch.ones((oup)), requires_grad=True)
|
653 |
if layer_scale_init_value > 0
|
654 |
else None
|
|
|
664 |
|
665 |
x = self.mlp(x)
|
666 |
|
667 |
+
if self.scale is not None:
|
668 |
+
x = self.scale * x
|
669 |
x = x.permute(0, 4, 1, 2, 3) # (N, H, W, C) -> (N, C, H, W)
|
670 |
|
671 |
x = input + self.drop_path(x)
|