add AIBOM #77
by sabato-nocera - opened
openbmb_MiniCPM-Llama3-V-2_5.json
ADDED
@@ -0,0 +1,109 @@
+{
+  "bomFormat": "CycloneDX",
+  "specVersion": "1.6",
+  "serialNumber": "urn:uuid:1d723570-c4c0-4b5d-b1a1-aac2d99d4b2d",
+  "version": 1,
+  "metadata": {
+    "timestamp": "2025-06-05T09:41:47.231012+00:00",
+    "component": {
+      "type": "machine-learning-model",
+      "bom-ref": "openbmb/MiniCPM-Llama3-V-2_5-0f9a184d-7b24-5845-b52e-cbdc16daa953",
+      "name": "openbmb/MiniCPM-Llama3-V-2_5",
+      "externalReferences": [
+        {
+          "url": "https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5",
+          "type": "documentation"
+        }
+      ],
+      "modelCard": {
+        "modelParameters": {
+          "task": "image-text-to-text",
+          "architectureFamily": "minicpmv",
+          "modelArchitecture": "MiniCPMV",
+          "datasets": [
+            {
+              "ref": "openbmb/RLAIF-V-Dataset-d1ff380b-52a0-586f-946e-773a0ef8556f"
+            }
+          ]
+        },
+        "properties": [
+          {
+            "name": "library_name",
+            "value": "transformers"
+          }
+        ]
+      },
+      "authors": [
+        {
+          "name": "openbmb"
+        }
+      ],
+      "description": "**MiniCPM-Llama3-V 2.5** is the latest model in the MiniCPM-V series. The model is built on SigLip-400M and Llama3-8B-Instruct with a total of 8B parameters. It exhibits a significant performance improvement over MiniCPM-V 2.0. Notable features of MiniCPM-Llama3-V 2.5 include:- \ud83d\udd25 **Leading Performance.**MiniCPM-Llama3-V 2.5 has achieved an average score of 65.1 on OpenCompass, a comprehensive evaluation over 11 popular benchmarks. **With only 8B parameters, it surpasses widely used proprietary models like GPT-4V-1106, Gemini Pro, Claude 3 and Qwen-VL-Max** and greatly outperforms other Llama 3-based MLLMs.- \ud83d\udcaa **Strong OCR Capabilities.**MiniCPM-Llama3-V 2.5 can process images with any aspect ratio and up to 1.8 million pixels (e.g., 1344x1344), achieving an **700+ score on OCRBench, surpassing proprietary models such as GPT-4o, GPT-4V-0409, Qwen-VL-Max and Gemini Pro**. Based on recent user feedback, MiniCPM-Llama3-V 2.5 has now enhanced full-text OCR extraction, table-to-markdown conversion, and other high-utility capabilities, and has further strengthened its instruction-following and complex reasoning abilities, enhancing multimodal interaction experiences.- \ud83c\udfc6 **Trustworthy Behavior.**Leveraging the latest [RLAIF-V](https://github.com/RLHF-V/RLAIF-V/) method (the newest technology in the [RLHF-V](https://github.com/RLHF-V) [CVPR'24] series), MiniCPM-Llama3-V 2.5 exhibits more trustworthy behavior. It achieves **10.3%** hallucination rate on Object HalBench, lower than GPT-4V-1106 (13.6%), achieving the best-level performance within the open-source community. [Data released](https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset).- \ud83c\udf0f **Multilingual Support.**Thanks to the strong multilingual capabilities of Llama 3 and the cross-lingual generalization technique from [VisCPM](https://github.com/OpenBMB/VisCPM), MiniCPM-Llama3-V 2.5 extends its bilingual (Chinese-English) multimodal capabilities to **over 30 languages including German, French, Spanish, Italian, Korean, Japanese etc.** [All Supported Languages](./assets/minicpm-llama-v-2-5_languages.md).- \ud83d\ude80 **Efficient Deployment.**MiniCPM-Llama3-V 2.5 systematically employs **model quantization, CPU optimizations, NPU optimizations and compilation optimizations**, achieving high-efficiency deployment on edge devices. For mobile phones with Qualcomm chips, we have integrated the NPU acceleration framework QNN into llama.cpp for the first time. After systematic optimization, MiniCPM-Llama3-V 2.5 has realized a **150-fold acceleration in multimodal large model end-side image encoding** and a **3-fold increase in language decoding speed**.- \ud83d\udcab **Easy Usage.**MiniCPM-Llama3-V 2.5 can be easily used in various ways: (1) [llama.cpp](https://github.com/OpenBMB/llama.cpp/blob/minicpm-v2.5/examples/minicpmv/README.md) and [ollama](https://github.com/OpenBMB/ollama/tree/minicpm-v2.5/examples/minicpm-v2.5) support for efficient CPU inference on local devices, (2) [GGUF](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) format quantized models in 16 sizes, (3) efficient [LoRA](https://github.com/OpenBMB/MiniCPM-V/tree/main/finetune#lora-finetuning) fine-tuning with only 2 V100 GPUs, (4) [streaming output](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5#usage), (5) quick local WebUI demo setup with [Gradio](https://github.com/OpenBMB/MiniCPM-V/blob/main/web_demo_2.5.py) and [Streamlit](https://github.com/OpenBMB/MiniCPM-V/blob/main/web_demo_streamlit-2_5.py), and (6) interactive demos on [HuggingFace Spaces](https://huggingface.co/spaces/openbmb/MiniCPM-Llama3-V-2_5).",
+      "tags": [
+        "transformers",
+        "safetensors",
+        "minicpmv",
+        "feature-extraction",
+        "minicpm-v",
+        "vision",
+        "ocr",
+        "custom_code",
+        "image-text-to-text",
+        "conversational",
+        "multilingual",
+        "dataset:openbmb/RLAIF-V-Dataset",
+        "region:us"
+      ]
+    }
+  },
+  "components": [
+    {
+      "type": "data",
+      "bom-ref": "openbmb/RLAIF-V-Dataset-d1ff380b-52a0-586f-946e-773a0ef8556f",
+      "name": "openbmb/RLAIF-V-Dataset",
+      "data": [
+        {
+          "type": "dataset",
+          "bom-ref": "openbmb/RLAIF-V-Dataset-d1ff380b-52a0-586f-946e-773a0ef8556f",
+          "name": "openbmb/RLAIF-V-Dataset",
+          "contents": {
+            "url": "https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset",
+            "properties": [
+              {
+                "name": "task_categories",
+                "value": "visual-question-answering"
+              },
+              {
+                "name": "language",
+                "value": "en"
+              },
+              {
+                "name": "size_categories",
+                "value": "10K<n<100K"
+              },
+              {
+                "name": "pretty_name",
+                "value": "RLAIF-V-Dataset"
+              },
+              {
+                "name": "license",
+                "value": "cc-by-nc-4.0"
+              }
+            ]
+          },
+          "governance": {
+            "owners": [
+              {
+                "organization": {
+                  "name": "openbmb",
+                  "url": "https://huggingface.co/openbmb"
+                }
+              }
+            ]
+          },
+          "description": "\n\t\n\t\t\n\t\tDataset Card for RLAIF-V-Dataset\n\t\n\nGitHub | Paper\n\n\t\n\t\t\n\t\tNews:\n\t\n\n\n[2025.03.01] \ud83c\udf89 RLAIF-V is accepted by CVPR 2025! You can access the lastest version of the paper at here.\n[2024.05.28] \ud83d\udcc3 Our paper is accesible at arxiv now!\n[2024.05.20] \ud83d\udd25 Our data is used in MiniCPM-Llama3-V 2.5, which represents the first end-side MLLM achieving GPT-4V level performance!\n\n\n\t\n\t\t\n\t\n\t\n\t\tDataset Summary\n\t\n\nRLAIF-V-Dataset is a large-scale multimodal feedback dataset. The dataset provides\u2026 See the full description on the dataset page: https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset."
+        }
+      ]
+    }
+  ]
+}
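For reviewers who want to sanity-check the file before merging, here is a minimal sketch that loads the AIBOM with Python's standard library and prints the fields a quick review would look at. It assumes the file is saved under the name used in this PR; the field paths follow the CycloneDX 1.6 structure shown above, and the ref-resolution step is just one reasonable consistency check, not part of the PR itself.

```python
import json

# Load the AIBOM added in this PR (filename as it appears in the diff).
with open("openbmb_MiniCPM-Llama3-V-2_5.json") as f:
    bom = json.load(f)

# Basic format checks against what the file declares.
assert bom["bomFormat"] == "CycloneDX"
assert bom["specVersion"] == "1.6"

# The model itself is described as the metadata component.
model = bom["metadata"]["component"]
print("model:", model["name"], f"({model['type']})")

# Dataset refs in the model card should resolve to "data" components
# elsewhere in the BOM via matching bom-ref values.
refs = {d["ref"] for d in model["modelCard"]["modelParameters"]["datasets"]}
for comp in bom.get("components", []):
    if comp["bom-ref"] in refs:
        refs.discard(comp["bom-ref"])
        print("dataset:", comp["name"])
        for entry in comp["data"]:
            for prop in entry["contents"]["properties"]:
                if prop["name"] == "license":
                    print("  dataset license:", prop["value"])

# Any leftover refs would be dangling dataset references.
assert not refs, f"unresolved dataset refs: {refs}"
```

Run against this file, the sketch would surface the points of interest for review: the model component (openbmb/MiniCPM-Llama3-V-2_5), its single training-dataset dependency (openbmb/RLAIF-V-Dataset), and that dataset's cc-by-nc-4.0 license, while confirming every dataset ref resolves to a component in the BOM.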