niobures committed (verified)
Commit c3dd1c2 · 1 Parent(s): 182154c

LaViLa (EK-100_MIR), Llava-v1.5-7B-GGUF, MobileVLM, Qwen2-VL-2B-Instruct, YOLO v8n, YOLO World

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .gitattributes +19 -0
  2. LLaVA/Llava-v1.5-7B-GGUF/llava-v1.5-7b-Q2_K.gguf +3 -0
  3. LLaVA/Llava-v1.5-7B-GGUF/llava-v1.5-7b-mmproj-model-f16.gguf +3 -0
  4. LaViLa/EK-100_MIR/TSF-B/clip_openai_timesformer_base.ft_ek100_mir.ep_0085.md5sum_c67d95.pth +3 -0
  5. MobileVLM/MobileVLM-1.7B-GGUF/.gitattributes +39 -0
  6. MobileVLM/MobileVLM-1.7B-GGUF/MobileVLM-1.7B-Q4_K.gguf +3 -0
  7. MobileVLM/MobileVLM-1.7B-GGUF/MobileVLM-1.7B-Q5_K.gguf +3 -0
  8. MobileVLM/MobileVLM-1.7B-GGUF/MobileVLM-1.7B-Q6_K.gguf +3 -0
  9. MobileVLM/MobileVLM-1.7B-GGUF/MobileVLM-1.7B-mmproj-f16.gguf +3 -0
  10. MobileVLM/MobileVLM_V2-1.7B-GGUF/.gitattributes +37 -0
  11. MobileVLM/MobileVLM_V2-1.7B-GGUF/README.md +16 -0
  12. MobileVLM/MobileVLM_V2-1.7B-GGUF/ggml-model-q4_k.gguf +3 -0
  13. MobileVLM/MobileVLM_V2-1.7B-GGUF/mmproj-model-f16.gguf +3 -0
  14. Qwen2-VL/Qwen2-VL-2B-Instruct-GGUF/.gitattributes +41 -0
  15. Qwen2-VL/Qwen2-VL-2B-Instruct-GGUF/Qwen2-VL-2B-Instruct-Q3_K_L.gguf +3 -0
  16. Qwen2-VL/Qwen2-VL-2B-Instruct-GGUF/Qwen2-VL-2B-Instruct-Q4_K_M.gguf +3 -0
  17. Qwen2-VL/Qwen2-VL-2B-Instruct-GGUF/Qwen2-VL-2B-Instruct-Q6_K.gguf +3 -0
  18. Qwen2-VL/Qwen2-VL-2B-Instruct-GGUF/Qwen2-VL-2B-Instruct-Q8_0.gguf +3 -0
  19. Qwen2-VL/Qwen2-VL-2B-Instruct-GGUF/README.md +29 -0
  20. Qwen2-VL/Qwen2-VL-2B-Instruct-GGUF/mmproj-model-f32.gguf +3 -0
  21. YOLO/yolo-world/yolo-world-s.pt +3 -0
  22. YOLO/yolov8n_silu_coco_640x640_quant_tflite_edgetpu_1/labels_yolov8n_silu_coco.json +82 -0
  23. YOLO/yolov8n_silu_coco_640x640_quant_tflite_edgetpu_1/yolov8n_silu_coco_640x640_quant_tflite_edgetpu_1.json +67 -0
  24. YOLO/yolov8n_silu_coco_640x640_quant_tflite_edgetpu_1/yolov8n_silu_coco_640x640_quant_tflite_edgetpu_1.tflite +3 -0
  25. onnx-community/Qwen2-VL-2B-Instruct/.gitattributes +41 -0
  26. onnx-community/Qwen2-VL-2B-Instruct/README.md +303 -0
  27. onnx-community/Qwen2-VL-2B-Instruct/added_tokens.json +16 -0
  28. onnx-community/Qwen2-VL-2B-Instruct/chat_template.json +3 -0
  29. onnx-community/Qwen2-VL-2B-Instruct/config.json +56 -0
  30. onnx-community/Qwen2-VL-2B-Instruct/generation_config.json +13 -0
  31. onnx-community/Qwen2-VL-2B-Instruct/merges.txt +0 -0
  32. onnx-community/Qwen2-VL-2B-Instruct/onnx/decoder_model_merged.onnx +3 -0
  33. onnx-community/Qwen2-VL-2B-Instruct/onnx/decoder_model_merged.onnx_data +3 -0
  34. onnx-community/Qwen2-VL-2B-Instruct/onnx/decoder_model_merged_bnb4.onnx +3 -0
  35. onnx-community/Qwen2-VL-2B-Instruct/onnx/decoder_model_merged_fp16.onnx +3 -0
  36. onnx-community/Qwen2-VL-2B-Instruct/onnx/decoder_model_merged_fp16.onnx_data +3 -0
  37. onnx-community/Qwen2-VL-2B-Instruct/onnx/decoder_model_merged_int8.onnx +3 -0
  38. onnx-community/Qwen2-VL-2B-Instruct/onnx/decoder_model_merged_q4.onnx +3 -0
  39. onnx-community/Qwen2-VL-2B-Instruct/onnx/decoder_model_merged_q4f16.onnx +3 -0
  40. onnx-community/Qwen2-VL-2B-Instruct/onnx/decoder_model_merged_quantized.onnx +3 -0
  41. onnx-community/Qwen2-VL-2B-Instruct/onnx/decoder_model_merged_uint8.onnx +3 -0
  42. onnx-community/Qwen2-VL-2B-Instruct/onnx/embed_tokens.onnx +3 -0
  43. onnx-community/Qwen2-VL-2B-Instruct/onnx/embed_tokens_bnb4.onnx +3 -0
  44. onnx-community/Qwen2-VL-2B-Instruct/onnx/embed_tokens_fp16.onnx +3 -0
  45. onnx-community/Qwen2-VL-2B-Instruct/onnx/embed_tokens_int8.onnx +3 -0
  46. onnx-community/Qwen2-VL-2B-Instruct/onnx/embed_tokens_q4.onnx +3 -0
  47. onnx-community/Qwen2-VL-2B-Instruct/onnx/embed_tokens_q4f16.onnx +3 -0
  48. onnx-community/Qwen2-VL-2B-Instruct/onnx/embed_tokens_quantized.onnx +3 -0
  49. onnx-community/Qwen2-VL-2B-Instruct/onnx/embed_tokens_uint8.onnx +3 -0
  50. onnx-community/Qwen2-VL-2B-Instruct/onnx/vision_encoder.onnx +3 -0
.gitattributes CHANGED
@@ -33,3 +33,22 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ LLaVA/Llava-v1.5-7B-GGUF/llava-v1.5-7b-mmproj-model-f16.gguf filter=lfs diff=lfs merge=lfs -text
+ LLaVA/Llava-v1.5-7B-GGUF/llava-v1.5-7b-Q2_K.gguf filter=lfs diff=lfs merge=lfs -text
+ MobileVLM/MobileVLM_V2-1.7B-GGUF/ggml-model-q4_k.gguf filter=lfs diff=lfs merge=lfs -text
+ MobileVLM/MobileVLM_V2-1.7B-GGUF/mmproj-model-f16.gguf filter=lfs diff=lfs merge=lfs -text
+ MobileVLM/MobileVLM-1.7B-GGUF/MobileVLM-1.7B-mmproj-f16.gguf filter=lfs diff=lfs merge=lfs -text
+ MobileVLM/MobileVLM-1.7B-GGUF/MobileVLM-1.7B-Q4_K.gguf filter=lfs diff=lfs merge=lfs -text
+ MobileVLM/MobileVLM-1.7B-GGUF/MobileVLM-1.7B-Q5_K.gguf filter=lfs diff=lfs merge=lfs -text
+ MobileVLM/MobileVLM-1.7B-GGUF/MobileVLM-1.7B-Q6_K.gguf filter=lfs diff=lfs merge=lfs -text
+ onnx-community/Qwen2-VL-2B-Instruct/onnx/decoder_model_merged_fp16.onnx_data filter=lfs diff=lfs merge=lfs -text
+ onnx-community/Qwen2-VL-2B-Instruct/onnx/decoder_model_merged.onnx_data filter=lfs diff=lfs merge=lfs -text
+ onnx-community/Qwen2-VL-2B-Instruct/onnx/vision_encoder_bnb4.onnx_data filter=lfs diff=lfs merge=lfs -text
+ onnx-community/Qwen2-VL-2B-Instruct/onnx/vision_encoder_q4.onnx_data filter=lfs diff=lfs merge=lfs -text
+ onnx-community/Qwen2-VL-2B-Instruct/onnx/vision_encoder.onnx_data filter=lfs diff=lfs merge=lfs -text
+ onnx-community/Qwen2-VL-2B-Instruct/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ Qwen2-VL/Qwen2-VL-2B-Instruct-GGUF/mmproj-model-f32.gguf filter=lfs diff=lfs merge=lfs -text
+ Qwen2-VL/Qwen2-VL-2B-Instruct-GGUF/Qwen2-VL-2B-Instruct-Q3_K_L.gguf filter=lfs diff=lfs merge=lfs -text
+ Qwen2-VL/Qwen2-VL-2B-Instruct-GGUF/Qwen2-VL-2B-Instruct-Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
+ Qwen2-VL/Qwen2-VL-2B-Instruct-GGUF/Qwen2-VL-2B-Instruct-Q6_K.gguf filter=lfs diff=lfs merge=lfs -text
+ Qwen2-VL/Qwen2-VL-2B-Instruct-GGUF/Qwen2-VL-2B-Instruct-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
LLaVA/Llava-v1.5-7B-GGUF/llava-v1.5-7b-Q2_K.gguf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dde114456d7b4ea5cc07c75e363f9807db1dcf82f61168cc91969316601ca467
+ size 2532863776
LLaVA/Llava-v1.5-7B-GGUF/llava-v1.5-7b-mmproj-model-f16.gguf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:50da4e5b0a011615f77686f9b02613571e65d23083c225e107c08c3b1775d9b1
+ size 624434368
LaViLa/EK-100_MIR/TSF-B/clip_openai_timesformer_base.ft_ek100_mir.ep_0085.md5sum_c67d95.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:17e8302ae99b607459b7c01480bf85a7ba664b8afc91b0bbf6a9be662887c91d
+ size 710829395
MobileVLM/MobileVLM-1.7B-GGUF/.gitattributes ADDED
@@ -0,0 +1,39 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ MobileVLM-1.7B-mmproj-f16.gguf filter=lfs diff=lfs merge=lfs -text
+ MobileVLM-1.7B-Q6_K.gguf filter=lfs diff=lfs merge=lfs -text
+ MobileVLM-1.7B-Q4_K.gguf filter=lfs diff=lfs merge=lfs -text
+ MobileVLM-1.7B-Q5_K.gguf filter=lfs diff=lfs merge=lfs -text
MobileVLM/MobileVLM-1.7B-GGUF/MobileVLM-1.7B-Q4_K.gguf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:41f0487ea4d3a4f58c467d779bfe66c02d9b33fa2d33b85cf456df89eb35dab9
+ size 834055776
MobileVLM/MobileVLM-1.7B-GGUF/MobileVLM-1.7B-Q5_K.gguf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5ca45d185225a61c0b003b7a4fe072ddae9ea57bbda7e7025e7dec50421192b8
+ size 972795488
MobileVLM/MobileVLM-1.7B-GGUF/MobileVLM-1.7B-Q6_K.gguf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:73ee20a00293a6419e311c349f7dc257ce0a15e9e59962933890204a0fcc0881
+ size 1120206432
MobileVLM/MobileVLM-1.7B-GGUF/MobileVLM-1.7B-mmproj-f16.gguf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7d9855d323cee2a1797a88f9d7057ce26b21dcd62a50b382c4ff44ea60c77e39
+ size 620384896
MobileVLM/MobileVLM_V2-1.7B-GGUF/.gitattributes ADDED
@@ -0,0 +1,37 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ ggml-model-q4_k.gguf filter=lfs diff=lfs merge=lfs -text
+ mmproj-model-f16.gguf filter=lfs diff=lfs merge=lfs -text
MobileVLM/MobileVLM_V2-1.7B-GGUF/README.md ADDED
@@ -0,0 +1,16 @@
+ ---
+ license: apache-2.0
+ tags:
+ - MobileVLM V2
+ ---
+ ## Model Summary
+ MobileVLM V2 is a family of vision language models that significantly improves upon MobileVLM, showing that a careful combination of novel architectural design, an improved training scheme tailored for mobile VLMs, and rich, high-quality dataset curation can substantially benefit VLM performance. Specifically, MobileVLM V2 1.7B achieves better or on-par performance on standard VLM benchmarks compared with much larger VLMs at the 3B scale. Notably, the MobileVLM_V2-3B model outperforms a large variety of VLMs at the 7B+ scale.
+
+ MobileVLM_V2-1.7B was built on our [MobileLLaMA-1.4B-Chat](https://huggingface.co/mtgv/MobileLLaMA-1.4B-Chat) to facilitate off-the-shelf deployment.
+
+ ## Model Sources
+ - Repository: https://github.com/Meituan-AutoML/MobileVLM
+ - Paper: [MobileVLM V2: Faster and Stronger Baseline for Vision Language Model](https://arxiv.org/abs/2402.03766)
+
+ ## How to Get Started with the Model
+ Inference examples can be found on [GitHub](https://github.com/Meituan-AutoML/MobileVLM).
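The README above defers to the upstream GitHub repository for inference examples. For the two GGUF files added alongside it in this folder, the following is only a minimal sketch of local inference, assuming a llama.cpp build whose LLaVA example binary is named `llama-llava-cli` (older releases call it `llava-cli`) and a local test image at `./demo.jpg`; the binary name, flags, and paths are illustrative and may need adjusting for your checkout.

```bash
# Sketch only: run the MobileVLM_V2-1.7B GGUF pair with llama.cpp's LLaVA example.
# -m       quantized language model
# --mmproj multimodal projector in GGUF form
./llama-llava-cli \
  -m MobileVLM/MobileVLM_V2-1.7B-GGUF/ggml-model-q4_k.gguf \
  --mmproj MobileVLM/MobileVLM_V2-1.7B-GGUF/mmproj-model-f16.gguf \
  --image ./demo.jpg \
  -p "Describe the image in one sentence." \
  --temp 0.0 -n 128
```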
MobileVLM/MobileVLM_V2-1.7B-GGUF/ggml-model-q4_k.gguf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:15d4bd09293404831902c23dd898aa2cc7b4b223b6c39a64e330601ef72d99db
+ size 791817856
MobileVLM/MobileVLM_V2-1.7B-GGUF/mmproj-model-f16.gguf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:57966afa654e9d46a11b2a4b17989c2d487cd961f702c4fe310f86db5e30aab4
+ size 595103072
Qwen2-VL/Qwen2-VL-2B-Instruct-GGUF/.gitattributes ADDED
@@ -0,0 +1,41 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ Qwen2-VL-2B-Instruct-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
+ Qwen2-VL-2B-Instruct-Q6_K.gguf filter=lfs diff=lfs merge=lfs -text
+ Qwen2-VL-2B-Instruct-Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
+ Qwen2-VL-2B-Instruct-Q3_K_L.gguf filter=lfs diff=lfs merge=lfs -text
+ mmproj-Qwen2-VL-2B-Instruct-f32.gguf filter=lfs diff=lfs merge=lfs -text
+ mmproj-model-f32.gguf filter=lfs diff=lfs merge=lfs -text
Qwen2-VL/Qwen2-VL-2B-Instruct-GGUF/Qwen2-VL-2B-Instruct-Q3_K_L.gguf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:feb289177bcb04bec880720eb7a10890148f8a6586d911b921a301f55d632807
+ size 880161248
Qwen2-VL/Qwen2-VL-2B-Instruct-GGUF/Qwen2-VL-2B-Instruct-Q4_K_M.gguf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:20868587821c6b9f82089daac35918f8150c3568e3c4a2cdd51a24b6dd75ab79
+ size 986046944
Qwen2-VL/Qwen2-VL-2B-Instruct-GGUF/Qwen2-VL-2B-Instruct-Q6_K.gguf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b7505e951bbeda59f36e0ff413f5a76fec9281f640613e549e0f4398329cf631
+ size 1272738272
Qwen2-VL/Qwen2-VL-2B-Instruct-GGUF/Qwen2-VL-2B-Instruct-Q8_0.gguf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c11dff3f74b326668600607a04b412c320d9dfe917a77f48ed6abbd962dd4a44
+ size 1646571488
Qwen2-VL/Qwen2-VL-2B-Instruct-GGUF/README.md ADDED
@@ -0,0 +1,29 @@
+ ---
+ quantized_by: bartowski
+ pipeline_tag: text-generation
+ ---
+ ## 💫 Community Model> Qwen2 VL 2B Instruct by Qwen
+
+ *👾 [LM Studio](https://lmstudio.ai) Community models highlights program. Highlighting new & noteworthy models by the community. Join the conversation on [Discord](https://discord.gg/aPQfnNkxGC)*.
+
+ **Model creator:** [Qwen](https://huggingface.co/Qwen)<br>
+ **Original model**: [Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)<br>
+ **GGUF quantization:** provided by [bartowski](https://huggingface.co/bartowski) based on `llama.cpp` release [b4327](https://github.com/ggerganov/llama.cpp/releases/tag/b4327)<br>
+
+ ## Technical Details
+
+ Supports context length of 32k tokens.
+
+ Vision model capable of understanding images of various resolutions and ratios.
+
+ Complex reasoning for agentic automation with vision.
+
+ Multilingual support.
+
+ ## Special thanks
+
+ 🙏 Special thanks to [Georgi Gerganov](https://github.com/ggerganov) and the whole team working on [llama.cpp](https://github.com/ggerganov/llama.cpp/) for making all of this possible.
+
+ ## Disclaimers
+
+ LM Studio is not the creator, originator, or owner of any Model featured in the Community Model Program. Each Community Model is created and provided by third parties. LM Studio does not endorse, support, represent or guarantee the completeness, truthfulness, accuracy, or reliability of any Community Model. You understand that Community Models can produce content that might be offensive, harmful, inaccurate or otherwise inappropriate, or deceptive. Each Community Model is the sole responsibility of the person or entity who originated such Model. LM Studio may not monitor or control the Community Models and cannot, and does not, take responsibility for any such Model. LM Studio disclaims all warranties or guarantees about the accuracy, reliability or benefits of the Community Models. LM Studio further disclaims any warranty that the Community Model will meet your requirements, be secure, uninterrupted or available at any time or location, or error-free, viruses-free, or that any errors will be corrected, or otherwise. You will be solely responsible for any damage resulting from your use of or access to the Community Models, your downloading of any Community Model, or use of any other Community Model provided by or through LM Studio.
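Usage note for this folder: each quantized language model pairs with `mmproj-model-f32.gguf`, the vision projector. Below is only a minimal sketch, assuming a llama.cpp build from around release b4327 that ships the Qwen2-VL example binary (named here `llama-qwen2vl-cli`; the exact name and flags vary by release) and a local image at `./demo.jpg`.

```bash
# Sketch only: Qwen2-VL-2B-Instruct GGUF inference with llama.cpp's Qwen2-VL example.
./llama-qwen2vl-cli \
  -m Qwen2-VL/Qwen2-VL-2B-Instruct-GGUF/Qwen2-VL-2B-Instruct-Q4_K_M.gguf \
  --mmproj Qwen2-VL/Qwen2-VL-2B-Instruct-GGUF/mmproj-model-f32.gguf \
  --image ./demo.jpg \
  -p "Describe this image." \
  --temp 0.1 -n 128
```

Q4_K_M is a reasonable default size/quality trade-off; swap in Q3_K_L, Q6_K, or Q8_0 as memory allows.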
Qwen2-VL/Qwen2-VL-2B-Instruct-GGUF/mmproj-model-f32.gguf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3b5a4e3b50652c60f7a027fb113d7e9d8f9411b0702f4de4e0743f71a3355530
+ size 2661115392
YOLO/yolo-world/yolo-world-s.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2b36b186fa4279efabade126b2ccce57008f36523e41bf344a6fd155e49a0368
+ size 27166882
YOLO/yolov8n_silu_coco_640x640_quant_tflite_edgetpu_1/labels_yolov8n_silu_coco.json ADDED
@@ -0,0 +1,82 @@
+ {
+   "0": "person",
+   "1": "bicycle",
+   "2": "car",
+   "3": "motorcycle",
+   "4": "airplane",
+   "5": "bus",
+   "6": "train",
+   "7": "truck",
+   "8": "boat",
+   "9": "traffic light",
+   "10": "fire hydrant",
+   "11": "stop sign",
+   "12": "parking meter",
+   "13": "bench",
+   "14": "bird",
+   "15": "cat",
+   "16": "dog",
+   "17": "horse",
+   "18": "sheep",
+   "19": "cow",
+   "20": "elephant",
+   "21": "bear",
+   "22": "zebra",
+   "23": "giraffe",
+   "24": "backpack",
+   "25": "umbrella",
+   "26": "handbag",
+   "27": "tie",
+   "28": "suitcase",
+   "29": "frisbee",
+   "30": "skis",
+   "31": "snowboard",
+   "32": "sports ball",
+   "33": "kite",
+   "34": "baseball bat",
+   "35": "baseball glove",
+   "36": "skateboard",
+   "37": "surfboard",
+   "38": "tennis racket",
+   "39": "bottle",
+   "40": "wine glass",
+   "41": "cup",
+   "42": "fork",
+   "43": "knife",
+   "44": "spoon",
+   "45": "bowl",
+   "46": "banana",
+   "47": "apple",
+   "48": "sandwich",
+   "49": "orange",
+   "50": "broccoli",
+   "51": "carrot",
+   "52": "hot dog",
+   "53": "pizza",
+   "54": "donut",
+   "55": "cake",
+   "56": "chair",
+   "57": "couch",
+   "58": "potted plant",
+   "59": "bed",
+   "60": "dining table",
+   "61": "toilet",
+   "62": "tv",
+   "63": "laptop",
+   "64": "mouse",
+   "65": "remote",
+   "66": "keyboard",
+   "67": "cell phone",
+   "68": "microwave",
+   "69": "oven",
+   "70": "toaster",
+   "71": "sink",
+   "72": "refrigerator",
+   "73": "book",
+   "74": "clock",
+   "75": "vase",
+   "76": "scissors",
+   "77": "teddy bear",
+   "78": "hair drier",
+   "79": "toothbrush"
+ }
YOLO/yolov8n_silu_coco_640x640_quant_tflite_edgetpu_1/yolov8n_silu_coco_640x640_quant_tflite_edgetpu_1.json ADDED
@@ -0,0 +1,67 @@
+ {
+     "ConfigVersion": 6,
+     "Checksum": "1b27ef9c2ceea55429dd1c98a13510e8597495a62bf5d1bf74e72b9bad883502",
+     "DEVICE": [
+         {
+             "DeviceType": "EDGETPU",
+             "RuntimeAgent": "TFLITE",
+             "SupportedDeviceTypes": "TFLITE/EDGETPU"
+         }
+     ],
+     "PRE_PROCESS": [
+         {
+             "InputN": 1,
+             "InputType": "Image",
+             "InputResizeMethod": "bilinear",
+             "InputPadMethod": "letterbox",
+             "ImageBackend": "auto",
+             "InputH": 640,
+             "InputW": 640,
+             "InputC": 3,
+             "InputQuantEn": true,
+             "InputQuantOffset": 0,
+             "InputQuantScale": 0.00392156862745098,
+             "InputImgNormEn": true,
+             "InputImgNormCoeff": 0.00392156862745098,
+             "InputNormMean": [
+                 0,
+                 0,
+                 0
+             ],
+             "InputNormStd": [
+                 1,
+                 1,
+                 1
+             ],
+             "InputTensorLayout": "NHWC",
+             "InputImgSliceType": "SLICE2"
+         }
+     ],
+     "MODEL_PARAMETERS": [
+         {
+             "ModelPath": "yolov8n_silu_coco--640x640_quant_tflite_edgetpu_1.tflite"
+         }
+     ],
+     "POST_PROCESS": [
+         {
+             "OutputPostprocessType": "DetectionYoloV8",
+             "PostProcessorInputs": [
+                 220,
+                 221,
+                 225,
+                 222,
+                 224,
+                 223
+             ],
+             "OutputNumClasses": 80,
+             "LabelsPath": "labels_yolov8n_silu_coco.json",
+             "OutputClassIDAdjustment": 0,
+             "OutputNMSThreshold": 0.6,
+             "MaxDetectionsPerClass": 100,
+             "MaxClassesPerDetection": 1,
+             "UseRegularNMS": true,
+             "OutputConfThreshold": 0.3,
+             "MaxDetections": 100
+         }
+     ]
+ }
YOLO/yolov8n_silu_coco_640x640_quant_tflite_edgetpu_1/yolov8n_silu_coco_640x640_quant_tflite_edgetpu_1.tflite ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8023850f1b629752acd65c34cc6f3bdfe1e8d7985327e4ae6081a3b20346415e
+ size 3481056
onnx-community/Qwen2-VL-2B-Instruct/.gitattributes ADDED
@@ -0,0 +1,41 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ onnx/decoder_model_merged.onnx_data filter=lfs diff=lfs merge=lfs -text
+ onnx/decoder_model_merged_fp16.onnx_data filter=lfs diff=lfs merge=lfs -text
+ onnx/vision_encoder.onnx_data filter=lfs diff=lfs merge=lfs -text
+ onnx/vision_encoder_bnb4.onnx_data filter=lfs diff=lfs merge=lfs -text
+ onnx/vision_encoder_q4.onnx_data filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
onnx-community/Qwen2-VL-2B-Instruct/README.md ADDED
@@ -0,0 +1,303 @@
+ ---
+ license: apache-2.0
+ library_name: transformers.js
+ base_model: Qwen/Qwen2-VL-2B-Instruct
+ ---
+
+ https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct with ONNX weights to be compatible with Transformers.js.
+
+ ## Usage (Transformers.js)
+
+ If you haven't already, you can install the [Transformers.js](https://huggingface.co/docs/transformers.js) JavaScript library from [NPM](https://www.npmjs.com/package/@huggingface/transformers) using:
+ ```bash
+ npm i @huggingface/transformers
+ ```
+
+ **Example:** Image+text to text
+
+ ```js
+ import { AutoProcessor, Qwen2VLForConditionalGeneration, RawImage } from "@huggingface/transformers";
+
+ // Load processor and model
+ const model_id = "onnx-community/Qwen2-VL-2B-Instruct";
+ const processor = await AutoProcessor.from_pretrained(model_id);
+ const model = await Qwen2VLForConditionalGeneration.from_pretrained(model_id);
+
+ // Prepare inputs
+ const url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg";
+ const image = await (await RawImage.read(url)).resize(448, 448);
+ const conversation = [
+   {
+     role: "user",
+     content: [
+       { type: "image" },
+       { type: "text", text: "Describe this image." },
+     ],
+   },
+ ];
+ const text = processor.apply_chat_template(conversation, { add_generation_prompt: true });
+ const inputs = await processor(text, image);
+
+ // Perform inference
+ const outputs = await model.generate({
+   ...inputs,
+   max_new_tokens: 128,
+ });
+
+ // Decode output
+ const decoded = processor.batch_decode(
+   outputs.slice(null, [inputs.input_ids.dims.at(-1), null]),
+   { skip_special_tokens: true },
+ );
+ console.log(decoded[0]);
+ // The image depicts a serene beach scene with a woman and a dog. The woman is sitting on the sand, wearing a plaid shirt, and appears to be engaged in a playful interaction with the dog. The dog, which is a large breed, is sitting on its hind legs and appears to be reaching out to the woman, possibly to give her a high-five or a paw. The background shows the ocean with gentle waves, and the sky is clear, suggesting it might be either sunrise or sunset. The overall atmosphere is calm and relaxed, capturing a moment of connection between the woman and the dog.
+ ```
+
+ ## ONNX conversion script:
+ First, install the following dependencies:
+ ```sh
+ pip install --upgrade git+https://github.com/huggingface/transformers.git onnx==1.17.0 onnxruntime==1.20.1 optimum==1.23.3 onnxslim==0.1.42
+ ```
+
+ ```py
+ import os
+ import torch
+ from transformers import (
+     AutoProcessor,
+     Qwen2VLForConditionalGeneration,
+     DynamicCache,
+ )
+
+
+ class PatchedQwen2VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
+     def forward(self, *args):
+         inputs_embeds, attention_mask, position_ids, *past_key_values_args = args
+
+         # Convert past_key_values list to DynamicCache
+         if len(past_key_values_args) == 0:
+             past_key_values = None
+         else:
+             past_key_values = DynamicCache(self.config.num_hidden_layers)
+             for i in range(self.config.num_hidden_layers):
+                 key = past_key_values_args.pop(0)
+                 value = past_key_values_args.pop(0)
+                 past_key_values.update(key_states=key, value_states=value, layer_idx=i)
+
+         o = super().forward(
+             inputs_embeds=inputs_embeds,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             past_key_values=past_key_values,
+         )
+
+         flattened_past_key_values_outputs = {
+             "logits": o.logits,
+         }
+         output_past_key_values: DynamicCache = o.past_key_values
+         for i, (key, value) in enumerate(
+             zip(output_past_key_values.key_cache, output_past_key_values.value_cache)
+         ):
+             flattened_past_key_values_outputs[f"present.{i}.key"] = key
+             flattened_past_key_values_outputs[f"present.{i}.value"] = value
+
+         return flattened_past_key_values_outputs
+
+
+ # Constants
+ OUTPUT_FOLDER = "output"
+ EMBEDDING_MODEL_NAME = "embed_tokens.onnx"
+ TEXT_MODEL_NAME = "decoder_model_merged.onnx"
+ VISION_MODEL_NAME = "vision_encoder.onnx"
+ TEMP_MODEL_OUTPUT_FOLDER = os.path.join(OUTPUT_FOLDER, "temp")
+ FINAL_MODEL_OUTPUT_FOLDER = os.path.join(OUTPUT_FOLDER, "onnx")
+
+
+ # Load model and processor
+ model_id = "Qwen/Qwen2-VL-2B-Instruct"
+ model = PatchedQwen2VLForConditionalGeneration.from_pretrained(model_id).eval()
+ processor = AutoProcessor.from_pretrained(model_id)
+
+
+ # Save model configs and processor
+ model.config.save_pretrained(OUTPUT_FOLDER)
+ model.generation_config.save_pretrained(OUTPUT_FOLDER)
+ processor.save_pretrained(OUTPUT_FOLDER)
+ os.makedirs(TEMP_MODEL_OUTPUT_FOLDER, exist_ok=True)
+
+
+ # Configuration values
+ ## Text model
+ text_config = model.config
+ num_heads = text_config.num_attention_heads
+ num_key_value_heads = text_config.num_key_value_heads
+ head_dim = text_config.hidden_size // num_heads
+ num_layers = text_config.num_hidden_layers
+ hidden_size = text_config.hidden_size
+
+ ## Vision model
+ vision_config = model.config.vision_config
+ channel = vision_config.in_chans
+ temporal_patch_size = vision_config.temporal_patch_size
+ patch_size = vision_config.spatial_patch_size
+
+
+ # Dummy input sizes
+ grid_t, grid_h, grid_w = [1, 16, 16]
+ batch_size = 1
+ sequence_length = 16
+ num_channels = 3
+ past_sequence_length = 0
+
+ image_batch_size = 1  # TODO: Add support for > 1 images
+ assert image_batch_size == 1
+
+
+ # Dummy inputs
+ ## Embedding inputs
+ input_ids = torch.randint(
+     0, model.config.vocab_size, (batch_size, sequence_length), dtype=torch.int64
+ )
+
+ ## Text inputs
+ dummy_past_key_values_kwargs = {
+     f"past_key_values.{i}.{key}": torch.zeros(
+         batch_size,
+         num_key_value_heads,
+         past_sequence_length,
+         head_dim,
+         dtype=torch.float32,
+     )
+     for i in range(num_layers)
+     for key in ["key", "value"]
+ }
+ inputs_embeds = torch.ones(
+     batch_size, sequence_length, hidden_size, dtype=torch.float32
+ )
+ attention_mask = torch.ones(batch_size, sequence_length, dtype=torch.int64)
+ position_ids = torch.ones(3, batch_size, sequence_length, dtype=torch.int64)
+
+ ## Vision inputs
+ grid_thw = torch.tensor(
+     [[grid_t, grid_h, grid_w]] * image_batch_size, dtype=torch.int64
+ )
+ pixel_values = torch.randn(
+     image_batch_size * grid_t * grid_h * grid_w,
+     channel * temporal_patch_size * patch_size * patch_size,
+     dtype=torch.float32,
+ )
+
+
+ # ONNX Exports
+ ## Embedding model
+ embedding_inputs = dict(input_ids=input_ids)
+ embedding_inputs_positional = tuple(embedding_inputs.values())
+ model.model.embed_tokens(*embedding_inputs_positional)  # Test forward pass
+ EMBED_TOKENS_OUTPUT_PATH = os.path.join(TEMP_MODEL_OUTPUT_FOLDER, EMBEDDING_MODEL_NAME)
+ torch.onnx.export(
+     model.model.embed_tokens,
+     args=embedding_inputs_positional,
+     f=EMBED_TOKENS_OUTPUT_PATH,
+     export_params=True,
+     opset_version=14,
+     do_constant_folding=True,
+     input_names=list(embedding_inputs.keys()),
+     output_names=["inputs_embeds"],
+     dynamic_axes={
+         "input_ids": {0: "batch_size", 1: "sequence_length"},
+         "inputs_embeds": {0: "batch_size", 1: "sequence_length"},
+     },
+ )
+
+ ## Text model
+ text_inputs = dict(
+     inputs_embeds=inputs_embeds,
+     attention_mask=attention_mask,
+     position_ids=position_ids,
+     **dummy_past_key_values_kwargs,
+ )
+ text_inputs_positional = tuple(text_inputs.values())
+ text_outputs = model.forward(*text_inputs_positional)  # Test forward pass
+ TEXT_MODEL_OUTPUT_PATH = os.path.join(TEMP_MODEL_OUTPUT_FOLDER, TEXT_MODEL_NAME)
+ torch.onnx.export(
+     model,
+     args=text_inputs_positional,
+     f=TEXT_MODEL_OUTPUT_PATH,
+     export_params=True,
+     opset_version=14,
+     do_constant_folding=True,
+     input_names=list(text_inputs.keys()),
+     output_names=["logits"]
+     + [f"present.{i}.{key}" for i in range(num_layers) for key in ["key", "value"]],
+     dynamic_axes={
+         "inputs_embeds": {0: "batch_size", 1: "sequence_length"},
+         "attention_mask": {0: "batch_size", 1: "sequence_length"},
+         "position_ids": {1: "batch_size", 2: "sequence_length"},
+         **{
+             f"past_key_values.{i}.{key}": {0: "batch_size", 2: "past_sequence_length"}
+             for i in range(num_layers)
+             for key in ["key", "value"]
+         },
+         "logits": {0: "batch_size", 1: "sequence_length"},
+         **{
+             f"present.{i}.{key}": {0: "batch_size", 2: "past_sequence_length + 1"}
+             for i in range(num_layers)
+             for key in ["key", "value"]
+         },
+     },
+ )
+
+ ## Vision model
+ vision_inputs = dict(
+     pixel_values=pixel_values,
+     grid_thw=grid_thw,
+ )
+ vision_inputs_positional = tuple(vision_inputs.values())
+ vision_outputs = model.visual.forward(*vision_inputs_positional)  # Test forward pass
+ VISION_ENCODER_OUTPUT_PATH = os.path.join(TEMP_MODEL_OUTPUT_FOLDER, VISION_MODEL_NAME)
+ torch.onnx.export(
+     model.visual,
+     args=vision_inputs_positional,
+     f=VISION_ENCODER_OUTPUT_PATH,
+     export_params=True,
+     opset_version=14,
+     do_constant_folding=True,
+     input_names=list(vision_inputs.keys()),
+     output_names=["image_features"],
+     dynamic_axes={
+         "pixel_values": {
+             0: "batch_size * grid_t * grid_h * grid_w",
+             1: "channel * temporal_patch_size * patch_size * patch_size",
+         },
+         "grid_thw": {0: "batch_size"},
+         "image_features": {0: "batch_size * grid_t * grid_h * grid_w"},
+     },
+ )
+
+
+ # Post-processing
+ import onnx
+ import onnxslim
+ from optimum.onnx.graph_transformations import check_and_save_model
+
+ os.makedirs(FINAL_MODEL_OUTPUT_FOLDER, exist_ok=True)
+ for name in (EMBEDDING_MODEL_NAME, TEXT_MODEL_NAME, VISION_MODEL_NAME):
+     temp_model_path = os.path.join(TEMP_MODEL_OUTPUT_FOLDER, name)
+
+     ## Shape inference (especially needed by the vision encoder)
+     onnx.shape_inference.infer_shapes_path(temp_model_path, check_type=True, strict_mode=True)
+
+     ## Attempt to optimize the model with onnxslim
+     try:
+         model = onnxslim.slim(temp_model_path)
+     except Exception as e:
+         print(f"Failed to slim {name}: {e}")
+         model = onnx.load(temp_model_path)
+
+     ## Save model
+     final_model_path = os.path.join(FINAL_MODEL_OUTPUT_FOLDER, name)
+     check_and_save_model(model, final_model_path)
+
+ ## Cleanup
+ import shutil
+ shutil.rmtree(TEMP_MODEL_OUTPUT_FOLDER)
+ ```
onnx-community/Qwen2-VL-2B-Instruct/added_tokens.json ADDED
@@ -0,0 +1,16 @@
+ {
+   "<|box_end|>": 151649,
+   "<|box_start|>": 151648,
+   "<|endoftext|>": 151643,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644,
+   "<|image_pad|>": 151655,
+   "<|object_ref_end|>": 151647,
+   "<|object_ref_start|>": 151646,
+   "<|quad_end|>": 151651,
+   "<|quad_start|>": 151650,
+   "<|video_pad|>": 151656,
+   "<|vision_end|>": 151653,
+   "<|vision_pad|>": 151654,
+   "<|vision_start|>": 151652
+ }
onnx-community/Qwen2-VL-2B-Instruct/chat_template.json ADDED
@@ -0,0 +1,3 @@
+ {
+   "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
+ }
onnx-community/Qwen2-VL-2B-Instruct/config.json ADDED
@@ -0,0 +1,56 @@
+ {
+   "_attn_implementation_autoset": true,
+   "_name_or_path": "Qwen/Qwen2-VL-2B-Instruct",
+   "architectures": [
+     "Qwen2VLForConditionalGeneration"
+   ],
+   "attention_dropout": 0.0,
+   "bos_token_id": 151643,
+   "eos_token_id": 151645,
+   "hidden_act": "silu",
+   "hidden_size": 1536,
+   "image_token_id": 151655,
+   "initializer_range": 0.02,
+   "intermediate_size": 8960,
+   "max_position_embeddings": 32768,
+   "max_window_layers": 28,
+   "model_type": "qwen2_vl",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 28,
+   "num_key_value_heads": 2,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": {
+     "mrope_section": [
+       16,
+       24,
+       24
+     ],
+     "rope_type": "default",
+     "type": "default"
+   },
+   "rope_theta": 1000000.0,
+   "sliding_window": 32768,
+   "tie_word_embeddings": true,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.46.1",
+   "transformers.js_config": {
+     "dtype": {
+       "vision_encoder": "q8",
+       "embed_tokens": "fp16",
+       "decoder_model_merged": "q4"
+     }
+   },
+   "use_cache": true,
+   "use_sliding_window": false,
+   "video_token_id": 151656,
+   "vision_config": {
+     "hidden_size": 1536,
+     "in_chans": 3,
+     "model_type": "qwen2_vl",
+     "spatial_patch_size": 14
+   },
+   "vision_end_token_id": 151653,
+   "vision_start_token_id": 151652,
+   "vision_token_id": 151654,
+   "vocab_size": 151936
+ }
onnx-community/Qwen2-VL-2B-Instruct/generation_config.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "bos_token_id": 151643,
+   "do_sample": true,
+   "eos_token_id": [
+     151645,
+     151643
+   ],
+   "pad_token_id": 151643,
+   "temperature": 0.01,
+   "top_k": 1,
+   "top_p": 0.001,
+   "transformers_version": "4.46.1"
+ }
onnx-community/Qwen2-VL-2B-Instruct/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
onnx-community/Qwen2-VL-2B-Instruct/onnx/decoder_model_merged.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cfb545b801fe12aadf63e6fea2672118953a9ea4a6f93f3f54ef5a5abf640536
+ size 813442
onnx-community/Qwen2-VL-2B-Instruct/onnx/decoder_model_merged.onnx_data ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e6192eca60b017bd1065578fcd5ef536557a2ae197196101bbd4844dd55b551c
+ size 6174857216
onnx-community/Qwen2-VL-2B-Instruct/onnx/decoder_model_merged_bnb4.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4f2cf55550007bed1c5453270ea5fc236bddbbc22ca7ba1e7772b514372d508b
+ size 869655116
onnx-community/Qwen2-VL-2B-Instruct/onnx/decoder_model_merged_fp16.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:63a6d8aaadbd997dd1a3bcb047814e2e11106ace5bd05463e34341e18625261e
+ size 853568
onnx-community/Qwen2-VL-2B-Instruct/onnx/decoder_model_merged_fp16.onnx_data ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4dea38e3e9305ad4b63db905f2d55b64fd201a8178a27495d406188a6c437139
+ size 3087399936
onnx-community/Qwen2-VL-2B-Instruct/onnx/decoder_model_merged_int8.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1c747059a7294565f59dc6c6852af398477c53eaabf8c605a313d21e71f90cdf
+ size 1545150393
onnx-community/Qwen2-VL-2B-Instruct/onnx/decoder_model_merged_q4.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d5492bb8064c31f61c48106b3cb140f1a81f9123205f52234329e1119f348eae
+ size 966126572
onnx-community/Qwen2-VL-2B-Instruct/onnx/decoder_model_merged_q4f16.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3cbf3e285eb58d975fef96da82faee0277416b5c7c3342718f0fcc0330b27033
+ size 869378228
onnx-community/Qwen2-VL-2B-Instruct/onnx/decoder_model_merged_quantized.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1c747059a7294565f59dc6c6852af398477c53eaabf8c605a313d21e71f90cdf
+ size 1545150393
onnx-community/Qwen2-VL-2B-Instruct/onnx/decoder_model_merged_uint8.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:853451575e3be3bfe1a0506e6b2b858853f1ab1d974bed48d11fc7868b06138a
+ size 1545150497
onnx-community/Qwen2-VL-2B-Instruct/onnx/embed_tokens.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:152d4247d40033d1b6f23ba4b25674850fbaf1b10d259f74da69b31944114cd6
+ size 933495086
onnx-community/Qwen2-VL-2B-Instruct/onnx/embed_tokens_bnb4.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:88541f1b9947f617d4c1d5cea78ab8cb26574f74db5a7d64d7a9db22099aaa30
+ size 933495105
onnx-community/Qwen2-VL-2B-Instruct/onnx/embed_tokens_fp16.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a717f0d2010b07f99b1982b5407f4c55143ca9ec45cb574b3ba272211024c661
+ size 466747781
onnx-community/Qwen2-VL-2B-Instruct/onnx/embed_tokens_int8.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3b0c7a8061d5a116be58f9c58637a85e0b87795426dec299c66af21a9ff77a6f
+ size 233374240
onnx-community/Qwen2-VL-2B-Instruct/onnx/embed_tokens_q4.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:88541f1b9947f617d4c1d5cea78ab8cb26574f74db5a7d64d7a9db22099aaa30
+ size 933495105
onnx-community/Qwen2-VL-2B-Instruct/onnx/embed_tokens_q4f16.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1697b2fd5563f22684826e9d6c5c24f2a8da74caa44b123b51f7eab10af1a851
+ size 466747800
onnx-community/Qwen2-VL-2B-Instruct/onnx/embed_tokens_quantized.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3b0c7a8061d5a116be58f9c58637a85e0b87795426dec299c66af21a9ff77a6f
+ size 233374240
onnx-community/Qwen2-VL-2B-Instruct/onnx/embed_tokens_uint8.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3b0c7a8061d5a116be58f9c58637a85e0b87795426dec299c66af21a9ff77a6f
+ size 233374240
onnx-community/Qwen2-VL-2B-Instruct/onnx/vision_encoder.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b40135a66ba4b497b7b41252cc48abcfce280897d36545ac8c1e843f9d41f85a
+ size 1683152