LandyGuo committed · Commit dd5f831 · Parent(s): 81a8221

update v20250516 ckpts
Browse files
- .gitattributes +3 -0
- README.md +326 -125
- audio_detokenizer/cli/model.py +6 -6
- bailingmm_utils.py +78 -2
- data/matcha_tts-0.0.5.1-cp310-cp310-linux_x86_64.whl +3 -0
- data/spks/luna.pt +2 -2
- figures/ant-bailing.png +3 -0
- figures/cases/animal.png +3 -0
- figures/cases/audioqa_audio.wav +0 -0
- figures/cases/audioqa_video.mp4 +3 -0
- figures/cases/demo_430v2.mp4 +3 -0
- figures/cases/document.png +3 -0
- figures/cases/document_parse.png +3 -0
- figures/cases/grounding1.png +3 -0
- figures/cases/grounding1_vis.png +3 -0
- figures/cases/grounding2.png +3 -0
- figures/cases/grounding2_vis.png +3 -0
- figures/cases/gui.png +3 -0
- figures/cases/knowledge1.png +3 -0
- figures/cases/knowledge2.png +3 -0
- figures/cases/ocr.png +3 -0
- figures/cases/plant.png +3 -0
- figures/cases/reasoning.png +3 -0
- figures/cases/s2s.mp4 +3 -0
- figures/ming.png +3 -0
- figures/performance.png +3 -0
- figures/unified_samples.png +3 -0
- model-00001-of-00016.safetensors +1 -1
- model-00002-of-00016.safetensors +1 -1
- model-00015-of-00016.safetensors +1 -1
- model-00016-of-00016.safetensors +1 -1
- modeling_bailing_talker.py +39 -12
- modeling_bailingmm.py +30 -5
- requirements.txt +20 -0
- test_audio_tasks.py +3 -3
.gitattributes
CHANGED
@@ -38,5 +38,8 @@ data/wavs/BAC009S0915W0292.wav filter=lfs diff=lfs merge=lfs -text
 out.wav filter=lfs diff=lfs merge=lfs -text
 talker/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 data/openai_whisper-20240930-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
+data/matcha_tts-0.0.5.1-cp310-cp310-linux_x86_64.whl filter=lfs diff=lfs merge=lfs -text
+*.mp4 filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,19 +1,179 @@
-license: mit
-base_model:
-- inclusionAI/Ling-lite
----
-# Ming-Lite-Omni-Preview
-### Model Description
-- **Video understanding**: Supports KV-Cache dynamic compression of visual tokens. While supporting the ability to understand long videos of hours, it can also provide more detailed understanding of short videos of a few seconds.
@@ -22,24 +182,41 @@ You can download the model from both Huggingface and ModelScope.
-| **Model**
-| Ming-Lite-Omni
-## Quickstart
-    "inclusionAI/Ming-Lite-Omni
@@ -47,7 +224,7 @@
-processor = AutoProcessor.from_pretrained("inclusionAI/Ming-Lite-Omni
@@ -166,11 +343,13 @@
-    use_cache=
@@ -181,6 +360,7 @@
@@ -193,8 +373,25 @@
@@ -203,126 +400,130 @@
-        {"type": "audio", "audio": 'data/wavs/
-## Evaluation
-### Image benchmark
-<div align="center">
-| Benchmarks | Ming-Lite-Omni-Preview | Qwen2.5-VL-7B-Instruct | InternVL2.5-8B-MPO |
-|:------------------|:----------------------:|:---------------------------:|:------------------:|
-| AI2D | 83.84 | 83.9 | <b>84.5</b> |
-| HallusionBench | <b>54.68</b> | 51.9 | 51.7 |
-| MMBench_TEST_V11 | 79.63 | <b>84.3</b> | 82.0 |
-| MMMU | 57.0 | <b>58.6</b> | 54.8 |
-| MMStar | 62.0 | 63.9 | <b>65.2</b> |
-| MMVet | <b>73.6</b> | 67.1 | 68.1 |
-| MathVista | <b>69.0</b> | 68.2 | 67.9 |
-| OCRBench | 87.9 | 86.4 | <b>88.2</b> |
-| Average | <b>70.96</b> | 70.5 | 70.3 |
-</div>
-#### Object Recognition
-<div align="center">
-| Object Recognition | Ming-Lite-Omni-Preview | Qwen2.5-VL-7B | InternVL-2.5-8B |
-|:----------------------------|:----------------------:|:-------------:|:---------------:|
-| Plants | 52.1 | <b>55.3</b> | 32.8 |
-| Animals | 52.6 | <b>54.8</b> | 36.5 |
-| Home appliances & furniture | 93.5 | <b>97.4</b> | 90.9 |
-| Personal Electronics | <b>96.1</b> | 95.1 | 93.2 |
-| Food & Ingredients | 57.5 | <b>60.0</b> | 48.7 |
-| Tableware | <b>96.6</b> | 94.9 | 88.1 |
-| Vehicles | 31.9 | <b>40.9</b> | 31.9 |
-| Average | 68.6 | <b>71.2</b> | 60.3 |
-</div>
-### Video benchmark
-<div align="center">
-| Benchmarks | Ming-Lite-Omni-Preview | Qwen2.5VL-7B |
-|:-------------------|:------------------------:|:----------------:|
-| VideoMME wo/w sub. | 63.9/67.6 | <b>65.1/71.6</b> |
-| MVBench | 67.0 | <b>72.0</b> |
-| Video-MMMU | 45.4 | <b>47.44</b> |
-| LongVideoBench | 53.7 | <b>60.0</b> |
-</div>
-| GLM-4-Voice | 4.06 | 3.48 | 43.31 | 40.11 | 52.97 | 24.91 | 88.08 |
-| Kimi-Audio | 4.46 | <b>3.97</b> | <b>63.12</b> | 62.17 | <b>83.52</b> | <b>61.10</b> | <b>100.00</b> |
-| Qwen2.5-Omni | <b>4.49</b> | 3.93 | 55.71 | <b>61.32</b> | 81.10 | 52.87 | 99.42 |
-| Ming-Lite-Omni-Preview | 4.25 | 3.88 | 58.95 | 46.06 | 60.00 | 46.71 | 96.53 |
-</div>
-| Whisper Large-v3 | 5.14 | 4.76 | 9.68 | 18.54 | 1.9 | 3.65 |
-| Qwen2-Audio | 1.53 | 3.06 | 7.72 | 8.4 | <b>1.6</b> | 3.6 |
-| GLM-4-voice Base | 2.46 | - | - | - | 2.82 | 7.66 |
-| Baichuan-Omni-1.5 | - | - | 6.9 | 8.4 | - | - |
-| Qwen2.5-Omni | <b>1.18</b> | <b>2.36</b> | <b>5.9</b> | 7.7 | 1.8 | <b>3.4</b> |
-| Ming-Lite-Omni-Preview | 1.62 | 2.82 | 6.23 | <b>6.9</b> | 2.34 | 5.74 |
-|:--------------------------|:---------------:|:------------------------:|:----------------------:|
-| GPT-4o | <b>36.05</b> | - | - |
-| PaLI-X | 22.06 | 23.5 | 20.8 |
-| Qwen2.5-vl-32B | 19.35 | 20.55 | 18.28 |
-| Ming-Lite-Omni-Preview | 27.3 | 28.9 | 25.9 |
-</div>
-### OCR&GUI
-<div align="center">
-|:-------------------|:----------------------:|:----------------------:|
-| ChartQA_TEST | 85.2 | <b>87.3</b> |
-| DocVQA_TEST | 93.2 | <b>95.7</b> |
-| OCRBenchV2_en/zh | 52.2/51.6 | <b>56.3/57.2</b> |
-| OmniDocBench↓ | 34.7/34.5 | <b>30.8/39.8</b> |
-| TextVQA_VAL | 82.36 | <b>84.9</b> |
-| ScreenSpot | 79.3 | <b>84.7</b> |
-</div>
-## Model Sources
-- **Github Repository:** https://github.com/inclusionAI/Ming

New README.md content after this commit:
# Ming-Lite-Omni

<p align="center">
    <img src="./figures/ant-bailing.png" width="100"/>
</p>

<p align="center">📑 <a href="https://github.com/inclusionAI/Ming">Technical Report</a> | 📖 <a href="https://lucaria-academy.github.io/Ming-Omni/">Project Page</a> | 🤗 <a href="https://huggingface.co/inclusionAI/Ming-Lite-Omni">Hugging Face</a> | 🤖 <a href="https://www.modelscope.cn/models/inclusionAI/Ming-Lite-Omni">ModelScope</a></p>


## Introduction

Ming-lite-omni is a light version of Ming-omni, derived from [Ling-lite](https://github.com/inclusionAI/Ling) and featuring 2.8 billion activated parameters. It is a unified multimodal model capable of processing images, text, audio, and video, while demonstrating strong proficiency in both speech and image generation. Ming-lite-omni employs dedicated encoders to extract tokens from the different modalities, which are then processed by Ling, an MoE architecture equipped with newly proposed modality-specific routers. This design enables a single model to efficiently process and fuse multimodal inputs within a unified framework, facilitating diverse tasks without requiring separate models, task-specific fine-tuning, or structural redesign. Importantly, Ming-lite-omni extends beyond conventional multimodal models by supporting audio and image generation, achieved through the integration of an advanced audio decoder for natural-sounding speech and Ming-Lite-Uni for high-quality image generation; these components also allow the model to engage in context-aware chat, perform text-to-speech conversion, and conduct versatile image editing. Our experimental results show that Ming-lite-omni offers a powerful solution for unified perception and generation across all modalities.
Notably, Ming-lite-omni is the first open-source model we are aware of to match GPT-4o in modality support, and we release all code and model weights to encourage further research and development in the community.

<p align="center">
    <img src="./figures/ming.png" width="800"/>
</p>

## 📌 Updates

[//]: # (* [2025.05.28] 🔥 Our [Technical Report](https://arxiv.org/pdf/2505.02471) is public on arXiv.)
* [2025.05.28] 🔥 The official version of Ming-lite-omni is released, with better performance and image generation support.
* [2025.05.04] 🔥 We released the test version of Ming-lite-omni: [Ming-lite-omni-Preview](https://github.com/inclusionAI/Ming/tree/Ming-Lite-Omni-Preview).


## Key Features

- **Unified Omni-Modality Perception**: Ming-lite-omni, built on [Ling](https://github.com/inclusionAI/Ling), an MoE-architecture LLM, resolves task conflicts and ensures coherent integration of tokens from different modalities through modality-specific routers (a minimal routing sketch follows this list).

- **Unified Perception and Generation**: Ming-lite-omni achieves unified understanding and generation, enabling the model to interpret multimodal instructions and user intent during generation, which helps improve generation quality and usability across multiple tasks.

- **Innovative Generation Capabilities**: Ming-lite-omni can perceive all modalities and generate high-quality text, real-time speech, and vivid images simultaneously, delivering exceptional cross-modal performance across diverse tasks including image perception, audio-visual interaction, and image generation.

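To make the modality-specific routing idea concrete, here is a minimal, illustrative sketch of how tokens from different modalities could be dispatched through per-modality gating networks over a shared pool of experts. The class, method, and tensor names are hypothetical and are not taken from this repository's implementation.

```python
import torch
import torch.nn as nn

class ModalitySpecificRouter(nn.Module):
    """Illustrative only: one gating network per modality, shared experts."""

    def __init__(self, hidden_size: int, num_experts: int,
                 modalities=("text", "image", "audio", "video")):
        super().__init__()
        self.experts = nn.ModuleList(
            [nn.Linear(hidden_size, hidden_size) for _ in range(num_experts)]
        )
        # A separate gate per modality lets each modality learn its own expert mixture.
        self.gates = nn.ModuleDict(
            {m: nn.Linear(hidden_size, num_experts) for m in modalities}
        )

    def forward(self, tokens: torch.Tensor, modality: str, top_k: int = 2) -> torch.Tensor:
        # tokens: [num_tokens, hidden_size], produced by the encoder of `modality`
        scores = self.gates[modality](tokens)                     # [num_tokens, num_experts]
        weights, expert_ids = scores.softmax(-1).topk(top_k, dim=-1)
        out = torch.zeros_like(tokens)
        for k in range(top_k):
            for e, expert in enumerate(self.experts):
                mask = expert_ids[:, k] == e
                if mask.any():
                    out[mask] += weights[mask, k, None] * expert(tokens[mask])
        return out
```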
## Evaluation

Ming-lite-omni delivers exceptional cross-modal performance, as validated across image perception, audio-visual interaction, and image generation tasks. In image perception, Ming-lite-omni attained performance comparable to that of Qwen2.5-VL-7B while activating only 2.8B parameters. It delivers superior performance in end-to-end speech understanding and instruction following, surpassing Qwen2.5-Omni and Kimi-Audio. It also supports native-resolution image generation, editing, and style transfer, achieving a GenEval score of 0.64 and outperforming mainstream models such as SDXL. In terms of FID, Ming-lite-omni reaches 4.85, setting a new SOTA across existing methods.

<p align="center">
    <img src="./figures/performance.png" width="800"/>
</p>


### Image benchmark
<div align="center">

| Benchmarks | Ming-lite-omni | Qwen2.5-VL-7B-Instruct | InternVL2.5-8B-MPO |
|:------------------|:--------------:|:----------------------:|:------------------:|
| AI2D | 83.1 | 84.4 | <b>84.5</b> |
| HallusionBench | <b>55.0</b> | 55.8 | 51.7 |
| MMBench_TEST_V11 | 80.8 | <b>82.8</b> | 82.0 |
| MMMU | 56.3 | <b>56.6</b> | 54.8 |
| MMStar | 64.7 | 65.3 | <b>65.2</b> |
| MMVet | 71.3 | 71.6 | 68.1 |
| MathVista | <b>71.6</b> | 68.1 | 67.9 |
| OCRBench | <b>88.4</b> | 87.8 | 88.2 |
| Average | 71.4 | <b>71.5</b> | 70.3 |

</div>


#### Encyclopedia Benchmarks
<div align="center">

| Object Recognition | Ming-lite-omni | Qwen2.5-VL-7B-Instruct |
|:---------------------|:--------------:|:------------------------:|
| Plants | **54.96** | 47.8 |
| Animals | **56.7** | 50.85 |
| Vehicles | 41.91 | **42.29** |
| Food & Ingredients | **62.28** | 54.09 |
| Dishes | **44.3** | 39.07 |
| General | 91.08 | **92.42** |
| Average | **58.54** | 54.43 |

</div>

### Video benchmark

<div align="center">

| Benchmarks | Ming-lite-omni | Qwen2.5VL-7B-Instruct |
|:------------------------|:--------------:|:---------------------:|
| VideoMME | 67.0 | <b>67.3</b> |
| MVBench | 67.7 | <b>67.4</b> |
| Video-MMMU | 46.3 | <b>47.4</b> |
| LongVideoBench | 56.6 | 54.7 |
| Average | <b>59.4</b> | 59.2 |

</div>
Note: All models are evaluated based on 128 uniformly sampled frames.

### Audio benchmark
#### SpeechQA

<div align="center">

| Model | Average | AlpacaEval | CommonEval | SD-QA | MMSU | OpenBookQA | IFEval | AdvBench |
|:-----------------|:-------------:|:-----------:|:-----------:|:------------:|:------------:|:------------:|:------------:|:-------------:|
| Qwen2-Audio-chat | 3.545 | 3.69 | 3.40 | 35.35 | 35.43 | 49.01 | 22.57 | 98.85 |
| Baichuan-Audio | 3.695 | 4.00 | 3.39 | 49.64 | 48.80 | 63.30 | 41.32 | 86.73 |
| GLM-4-Voice | 3.77 | 4.06 | 3.48 | 43.31 | 40.11 | 52.97 | 24.91 | 88.08 |
| Kimi-Audio | 4.215 | 4.46 | 3.97 | <b>63.12</b> | <b>62.17</b> | <b>83.52</b> | <b>61.10</b> | <b>100.00</b> |
| Qwen2.5-Omni | 4.21 | 4.49 | 3.93 | 55.71 | 61.32 | 81.10 | 52.87 | 99.42 |
| Ming-lite-omni | <b>4.34</b> | <b>4.63</b> | <b>4.06</b> | 58.84 | 47.53 | 61.98 | 58.36 | 99.04 |
</div>

#### ASR

<div align="center">

| Model | aishell1 | aishell2_android | aishell2_ios | cv15_zh | fleurs_zh | wenetspeech_meeting | wenetspeech_net | librispeech_test_clean | librispeech_test_other | multilingual_librispeech | cv15_en | fleurs_en | voxpopuli_v1.0_en |
|:--------------:|:--------:|:----------------:|:------------:|:--------:|:---------:|:-------------------:|:---------------:|:----------------------:|:----------------------:|:------------------------:|:--------:|:---------:|:--------------------:|
| Ming-lite-omni | 1.47 | **2.55** | **2.52** | 6.31 | 2.96 | 5.95 | 5.46 | 1.44 | 2.80 | **4.15** | **6.89** | **3.39** | **5.80** |
| Qwen2.5-Omni | 1.18 | 2.75 | 2.63 | **5.20** | 3.00 | **5.90** | 7.70 | 1.80 | 3.40 | 7.56 | 7.60 | 4.10 | **5.80** |
| Qwen2-Audio | 1.53 | 2.92 | 2.92 | 6.90 | 7.50 | 7.16 | 8.42 | 1.60 | 3.60 | 5.40 | 8.60 | 6.90 | 6.84 |
| Kimi-Audio | **0.60** | 2.64 | 2.56 | 7.21 | **2.69** | 6.28 | **5.37** | **1.28** | **2.42** | 5.88 | 10.31 | 4.44 | 7.97 |

</div>


### Information-Seeking Benchmark
<div align="center">

| Model | InfoSeek_H-mean | InfoSeek_unseen_question | InfoSeek_unseen_entity |
|:---------------|:---------------:|:------------------------:|:----------------------:|
| GPT-4o | <b>36.05</b> | - | - |
| PaLI-X | 22.06 | 23.5 | 20.8 |
| Qwen2.5-vl-32B | 19.35 | 20.55 | 18.28 |
| Ming-lite-omni | 27.7 | **30.4** | **25.4** |
</div>


### OCR
<div align="center">

| Model | Ming-lite-omni | Qwen2.5-VL-7B-Instruct |
|:-------------------|:--------------:|:-----------------------:|
| ChartQA_TEST | 85.1 | <b>87.3</b> |
| DocVQA_TEST | 93 | <b>95.7</b> |
| OCRBenchV2_en/zh | 53.3/52 | <b>56.3/57.2</b> |
| OmniDocBench↓ | 34/<b>34.4</b> | <b>30.8</b>/39.8 |
| TextVQA_VAL | 82.8 | <b>84.9</b> |
</div>

### GUI
<div align="center">

| Model | Ming-lite-omni | InternVL3 8B | Qwen2.5-VL-7B-Instruct |
|:---------------------------|:--------------:|:------------:|:----------------------:|
| ScreenSpot | <b>82.1</b> | 79.5 | 78.9* |
| ScreenSpot-V2 | <b>84.1</b> | 81.4 | - |
| AITZ(EM) | <b>66.6</b> | - | 57.6* |
</div>
Note: * denotes reproduced results.


### Unified Generation Benchmark

<div align="center">

| Model | single_object | two_object | counting | colors | position | color_attr | GENEVAL | DPGBench | FID↓ |
|:---------------|:-------------:|:----------:|:----------:|:--------:|:--------:|:----------:|:--------:|:---------:|:-------------:|
| Ming-lite-omni | **0.9875** | **0.7727** | **0.6812** | 0.7872 | 0.31 | 0.29 | **0.64** | 81.72 | **4.85** |
| Metaquery-XL | - | - | - | - | - | - | 0.61 | **82.05** | 6.02 |
| SDv2.1 | 0.98 | 0.51 | 0.44 | **0.85** | 0.07 | 0.17 | 0.50 | 68.09 | 26.96 |
| Emu3-Gen | 0.98 | 0.71 | 0.34 | 0.81 | 0.17 | 0.21 | 0.54 | 80.60 | - |
| SDXL | 0.98 | 0.74 | 0.39 | **0.85** | 0.15 | 0.23 | 0.55 | 74.65 | 8.76 |
| Janus | 0.97 | 0.68 | 0.30 | 0.84 | **0.46** | **0.42** | 0.61 | 79.68 | 10.10 |
| JanusFlow | - | - | - | - | - | - | 0.63 | 80.09 | 9.51 |

</div>

Please refer to our technical report for more comprehensive evaluation results.

## Model Downloads

You can download the model from both Huggingface and ModelScope.

<div align="center">

| **Model** | **Input modality** | **Output modality** | **Download** |
|:---------------|:-------------------------:|:-------------------:|:----------------------------------------------------------------------------------------------------------------------------------------------------:|
| Ming-Lite-Omni | Image, text, video, audio | Image, text, audio | [🤗 HuggingFace](https://huggingface.co/inclusionAI/Ming-Lite-Omni) <br>[🤖 ModelScope](https://www.modelscope.cn/models/inclusionAI/Ming-Lite-Omni) |
</div>

If you are in mainland China, we strongly recommend downloading our model from 🤖 <a href="https://www.modelscope.cn/models/inclusionAI/Ming-Lite-Omni">ModelScope</a>.

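If you prefer to fetch the weights programmatically rather than cloning the repo, the snippet below is one possible way to do it with `huggingface_hub` (ModelScope offers an equivalent `snapshot_download`). The local directory name is only an example.

```python
# Optional: download the checkpoint with huggingface_hub.
from huggingface_hub import snapshot_download

local_path = snapshot_download(
    repo_id="inclusionAI/Ming-Lite-Omni",
    local_dir="./Ming-Lite-Omni",   # example target directory
)
print(local_path)
```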
## Use Cases

Additional demonstration cases are available on our project [page](https://lucaria-academy.github.io/Ming-Omni/).

## Example Usage

Please download our model following [Model Downloads](#model-downloads), then refer to the following code to run the Ming-lite-omni model.

Install the Python environment dependencies:
```shell
pip install -r requirements.txt
pip install data/matcha_tts-0.0.5.1-cp310-cp310-linux_x86_64.whl
pip install diffusers==0.33.0
pip install nvidia-cublas-cu12==12.4.5.8 # for H20
```
```python
import os
import torch
from transformers import AutoProcessor, GenerationConfig
from modeling_bailingmm import BailingMMNativeForConditionalGeneration

# build model
model = BailingMMNativeForConditionalGeneration.from_pretrained(
    "inclusionAI/Ming-Lite-Omni",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True
).to("cuda")

assets_path = YOUR_ASSETS_PATH

# build processor
processor = AutoProcessor.from_pretrained("inclusionAI/Ming-Lite-Omni", trust_remote_code=True)
```

```python
# ... (lines unchanged by this commit are omitted from the diff: they build the chat
# messages, apply the chat template, and run the processor as in the full README) ...
for k in inputs.keys():
    if k == "pixel_values" or k == "pixel_values_videos" or k == "audio_feats":
        inputs[k] = inputs[k].to(dtype=torch.bfloat16)

# call generate
generation_config = GenerationConfig.from_dict({'no_repeat_ngram_size': 10})
generated_ids = model.generate(
    **inputs,
    max_new_tokens=512,
    use_cache=True,
    eos_token_id=processor.gen_terminator,
    generation_config=generation_config,
)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
print(output_text)
```
### Audio tasks

```python
# ASR
# ... (the ASR chat messages are unchanged by this commit; only their closing
# brackets are shown in the diff) ...
    ],
},
]
# We use the whisper encoder for the ASR task, so the processor and generate calls
# above need to be adjusted as follows.
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    audios=audio_inputs,
    return_tensors="pt",
    audio_kwargs={'use_whisper_encoder': True}
)

outputs = model.generate(
    **inputs,
    max_new_tokens=512,
    use_cache=True,
    eos_token_id=processor.gen_terminator,
    generation_config=generation_config,
    use_whisper_encoder=True
)
```
```python
# SpeechQA
# ... (the beginning of the messages list is unchanged and omitted from the diff) ...
{
    "role": "HUMAN",
    "content": [
        {"type": "audio", "audio": 'data/wavs/speechQA_sample.wav'},
    ],
},
]
generation_config = GenerationConfig.from_dict({
    'output_hidden_states': True,
    'return_dict_in_generate': True,
    'no_repeat_ngram_size': 10}
)

outputs = model.generate(
    **inputs,
    max_new_tokens=512,
    use_cache=True,
    eos_token_id=processor.gen_terminator,
    generation_config=generation_config,
    use_whisper_encoder=False
)

generated_ids = outputs.sequences
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]

# speechQA result
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

# for TTS
from modeling_bailing_talker import AudioDetokenizer

model_name_or_path = model.config._name_or_path
audio_detokenizer = AudioDetokenizer(
    f'{model_name_or_path}/talker/audio_detokenizer.yaml',
    flow_model_path=f'{model_name_or_path}/talker/flow.pt',
    hifigan_model_path=f'{model_name_or_path}/talker/hift.pt'
)
spk_input = torch.load('data/spks/luna.pt')
thinker_reply_part = outputs.hidden_states[0][0] + outputs.hidden_states[0][-1]
# Setting thinker_reply_part to None allows the talker to operate as a standalone TTS model, independent of the language model.
audio_tokens = model.talker.omni_audio_generation(
    output_text,
    thinker_reply_part=thinker_reply_part, **spk_input)
waveform = audio_detokenizer.token2wav(audio_tokens, save_path='out.wav', **spk_input)
```

For detailed usage of the ASR, SpeechQA, and TTS tasks, please refer to `test_audio_tasks.py`.

### Image Generation & Edit

Ming-omni natively supports image generation and image editing. To use this functionality, you only need to pass the corresponding parameters to the `generate` function.

```python
# Image generation mode currently limits the range of input pixels.
gen_input_pixels = 451584
processor.max_pixels = gen_input_pixels
processor.min_pixels = gen_input_pixels

def generate(messages, processor, model, **image_gen_param):
    text = processor.apply_chat_template(messages, add_generation_prompt=True)
    image_inputs, video_inputs, audio_inputs = processor.process_vision_info(messages)

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        audios=audio_inputs,
        return_tensors="pt",
    ).to(model.device)

    for k in inputs.keys():
        if k == "pixel_values" or k == "pixel_values_videos" or k == "audio_feats":
            inputs[k] = inputs[k].to(dtype=torch.bfloat16)

    print(image_gen_param)
    image = model.generate(
        **inputs,
        image_gen=True,
        **image_gen_param,
    )
    return image
```

Text-to-image:
```python
messages = [
    {
        "role": "HUMAN",
        "content": [
            {"type": "text", "text": "Draw a girl with short hair."},
        ],
    }
]
image = generate(
    messages=messages, processor=processor, model=model,
    image_gen_cfg=6.0, image_gen_steps=20, image_gen_width=480, image_gen_height=544
)
image.save("./t2i.jpg")
```

Edit:
```python
messages = [
    {
        "role": "HUMAN",
        "content": [
            {"type": "image", "image": "samples/cake.jpg"},
            {"type": "text", "text": "add a candle on top of the cake"},
        ],
    }
]
image = generate(
    messages=messages, processor=processor, model=model,
    image_gen_cfg=6.0, image_gen_steps=20, image_gen_width=512, image_gen_height=512
)
image.save("./edit.jpg")
```

## License and Legal Disclaimer

This code repository is licensed under the [MIT License](../LICENSE), and the Legal Disclaimer is located in the [LEGAL.md file](../LEGAL.md) under the project's root directory.

audio_detokenizer/cli/model.py
CHANGED
@@ -36,9 +36,9 @@ class AudioDetokenizerModel:
         self.hift.load_state_dict(torch.load(hift_model, map_location=self.device))
         self.hift.to(self.device).eval()
 
-    def inference(self,
-
-                  prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32), is_en=False):
+    def inference(self, vp_emb, tts_speech_token,
+                  prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
+                  prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32), is_en=False, **kwargs):
 
         torch.cuda.synchronize()
         t0 = time.time()
@@ -48,11 +48,11 @@ class AudioDetokenizerModel:
 
         tts_mel = self.flow.inference(token=tts_speech_token.to(self.device),
                                       token_len=torch.tensor([tts_speech_token.size(1)], dtype=torch.int32).to(self.device),
-                                      prompt_token=
-                                      prompt_token_len=
+                                      prompt_token=prompt_speech_token.to(self.device),
+                                      prompt_token_len=prompt_speech_token_len.to(self.device),
                                       prompt_feat=prompt_speech_feat.to(self.device),
                                       prompt_feat_len=prompt_speech_feat_len.to(self.device),
-                                      embedding=
+                                      embedding=vp_emb.to(self.device).to(self.dtype)).float()
         torch.cuda.synchronize()
 
         tts_speech = self.hift.inference(mel=tts_mel).cpu()
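For context on how the new `inference` arguments get populated at call time: `AudioDetokenizer.token2wav` now simply forwards the speaker dictionary through `**kwargs`, so a call could look roughly like the hedged sketch below. The exact keys stored in `data/spks/luna.pt` are an assumption here, inferred from the signature above.

```python
# Hedged usage sketch (not taken verbatim from the repository): how token2wav's
# **kwargs reach AudioDetokenizerModel.inference after this change.
import torch

spk_input = torch.load("data/spks/luna.pt")  # assumed to hold keys such as vp_emb,
                                             # prompt_speech_token, prompt_speech_token_len, ...
audio_tokens = model.talker.omni_audio_generation(output_text, thinker_reply_part=None, **spk_input)

# token2wav(audio_tokens, save_path=..., **spk_input) adds {"tts_speech_token": token}
# to the kwargs and calls self.model.inference(**kwargs), so vp_emb and the prompt
# speech tokens above end up as the new inference() arguments.
waveform = audio_detokenizer.token2wav(audio_tokens, save_path="out.wav", **spk_input)
```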
bailingmm_utils.py
CHANGED
@@ -170,7 +170,7 @@ def sample_frames(num_frames, total_frames, sample="random"):
         padded_frame_indices = [frame_indices[-1]] * num_frames
         padded_frame_indices[:len(frame_indices)] = frame_indices
         frame_indices = padded_frame_indices
-    elif sample == "uniform":
+    elif sample == "uniform" or sample == "adaptive":
         frame_indices = [(x[0] + x[1]) // 2 for x in ranges]
         if len(frame_indices) < num_frames:
             frame_indices = [
@@ -278,12 +278,75 @@ def _read_video_decord(
     frame_indices = sample_frames(
         num_frames=num_frames, total_frames=total_frames, sample=sample_method
     )
+    if sample_method == "adaptive" and len(frame_indices) > 64:
+        frames_indices_selected = select_frames_based_on_query(vr, frame_indices, ele)  # cross-modal frame selection driven by the query
+        indices = np.linspace(0, len(frame_indices) - 1, len(frame_indices)//2, dtype=int)
+        frame_indices = np.array(frame_indices)[indices].tolist()
+        frames_indices_selected_sort = np.sort(frame_indices + frames_indices_selected[:(num_frames - len(frame_indices))].tolist()).tolist()
+        video = vr.get_batch(frames_indices_selected_sort).asnumpy()
+    else:
+        video = vr.get_batch(frame_indices).asnumpy()
 
-    video = vr.get_batch(frame_indices).asnumpy()
+    # video = vr.get_batch(frame_indices).asnumpy()
     video = torch.tensor(video).permute(0, 3, 1, 2)  # Convert to TCHW format
     sample_fps = num_frames / max(total_frames, 1e-6) * video_fps
     return video, sample_fps
 
+def select_frames_based_on_query(vr, frame_indices, ele):
+    import sys
+    sys.path.join("./longvu")
+    '''
+    This LongVU model (https://github.com/Vision-CAIR/LongVU) computes cross-modal relevance
+    between user queries and video frames for the purpose of frame selection.
+    It can also be replaced with other text/visual encoders to achieve the same effect.
+    To maintain consistency in the repository structure, this module has not been included in the repository directory for now.
+    If needed for evaluation, simply import this module.
+    '''
+    from longvu.constants import (
+        DEFAULT_IMAGE_TOKEN,
+        IMAGE_TOKEN_INDEX,
+    )
+    from longvu.conversation import conv_templates, SeparatorStyle
+    from longvu.mm_datautils import (
+        KeywordsStoppingCriteria,
+        process_images,
+        tokenizer_image_token,
+    )
+    tokenizer, model, image_processor = ele["tokenizer"], ele["model"], ele["image_processor"]
+
+    # Frame_indices could be expanded here if needed.
+    video = vr.get_batch(frame_indices).asnumpy()  # e.g. (21, 320, 568, 3)
+
+    image_sizes = [video[0].shape[:2]]  # e.g. [(320, 568)]
+    video = process_images(video, image_processor, model.config)  # len(video)=2, shapes like [623, 3, 384, 384] and [623, 3, 378, 378]
+    video = [item.unsqueeze(0) for item in video]  # shapes like [1, 623, 3, 384, 384] and [1, 623, 3, 378, 378]
+
+    qs = DEFAULT_IMAGE_TOKEN + "\n" + ele["text"]
+    conv = conv_templates["qwen"].copy()
+    conv.append_message(conv.roles[0], qs)
+    conv.append_message(conv.roles[1], None)
+    prompt = conv.get_prompt()
+
+    input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(model.device)  # torch.Size([1, 26])
+    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2  # '<|im_end|>'
+    keywords = [stop_str]
+    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
+
+    with torch.inference_mode():
+        output_ids = model.generate(
+            input_ids,
+            images=video,
+            image_sizes=image_sizes,
+            do_sample=False,
+            temperature=0.2,
+            max_new_tokens=128,
+            use_cache=True,
+            stopping_criteria=[stopping_criteria],
+        )  # torch.Size([1, 128])
+
+    selected_indices = np.array(frame_indices)[output_ids.cpu().numpy()]
+    return selected_indices
+
 VIDEO_READER_BACKENDS = {
     "decord": _read_video_decord,
     "torchvision": _read_video_torchvision,
@@ -410,6 +473,19 @@ def extract_vision_info(conversations: list[dict] | list[list[dict]]) -> list[di
             or ele["type"] in ("image", "image_url", "video")
         ):
             vision_infos.append(ele)
+            # Also carry over the video's query text.
+            if "text" in ele: text = ele["text"]
+            if "video" in ele and ele["sample"] == "adaptive":
+                tokenizer = ele["tokenizer"]
+                model = ele["model"]
+                image_processor = ele["image_processor"]
+    for ele in vision_infos:
+        if "video" in ele and ele["sample"] == "adaptive":
+            ele["text"] = text
+            ele["tokenizer"] = tokenizer
+            ele["model"] = model
+            ele["image_processor"] = image_processor
+    return vision_infos
     return vision_infos
 
 def process_vision_info(
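Based on the keys this patch reads from each message element (`sample`, `text`, `tokenizer`, `model`, `image_processor`), a video message that opts into the new query-aware sampling could be assembled roughly as below. This is a hedged sketch: the field names follow the code above, but the surrounding setup (the LongVU tokenizer, model, and image processor objects) is assumed rather than shown by the diff.

```python
# Hedged sketch: enabling the new "adaptive" (query-aware) frame sampling path.
# longvu_tokenizer / longvu_model / longvu_image_processor are assumed to be the
# LongVU components referenced in select_frames_based_on_query above.
messages = [
    {
        "role": "HUMAN",
        "content": [
            {
                "type": "video",
                "video": "figures/cases/audioqa_video.mp4",
                "sample": "adaptive",                      # routes through select_frames_based_on_query
                "tokenizer": longvu_tokenizer,
                "model": longvu_model,
                "image_processor": longvu_image_processor,
            },
            {"type": "text", "text": "What is the person in the video doing?"},
        ],
    }
]
image_inputs, video_inputs, audio_inputs = processor.process_vision_info(messages)
```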
data/matcha_tts-0.0.5.1-cp310-cp310-linux_x86_64.whl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e316b130dc66906d71bcb6bdc0d9c43c860901204cb64a4a3adb903dc1edfe90
+size 576473
data/spks/luna.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:22f9a6b446e0849d4233ac25d11ebf7e96b30746e16482890a6367c8a3b6e05f
+size 32094
figures/ant-bailing.png
ADDED
Image added via Git LFS.
figures/cases/animal.png
ADDED
Image added via Git LFS.
figures/cases/audioqa_audio.wav
ADDED
Binary file (64.8 kB) added.
figures/cases/audioqa_video.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:74b65d9bec7f83e487b7f923076c01d476dd2ef7ed83928a696ab6f88c7751b7
+size 776184
figures/cases/demo_430v2.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a1cf17a700173885c60c3b5fb79a13618b5075e5fb8781f98f4c67118c3933b3
+size 18945212
figures/cases/document.png
ADDED
Image added via Git LFS.
figures/cases/document_parse.png
ADDED
Image added via Git LFS.
figures/cases/grounding1.png
ADDED
Image added via Git LFS.
figures/cases/grounding1_vis.png
ADDED
Image added via Git LFS.
figures/cases/grounding2.png
ADDED
Image added via Git LFS.
figures/cases/grounding2_vis.png
ADDED
Image added via Git LFS.
figures/cases/gui.png
ADDED
Image added via Git LFS.
figures/cases/knowledge1.png
ADDED
Image added via Git LFS.
figures/cases/knowledge2.png
ADDED
Image added via Git LFS.
figures/cases/ocr.png
ADDED
Image added via Git LFS.
figures/cases/plant.png
ADDED
Image added via Git LFS.
figures/cases/reasoning.png
ADDED
Image added via Git LFS.
figures/cases/s2s.mp4
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5fd492db166436757f6774e0341245f05a24e2c81ab792112816a19b5e239589
+size 963109
figures/ming.png
ADDED
Image added via Git LFS.
figures/performance.png
ADDED
Image added via Git LFS.
figures/unified_samples.png
ADDED
Image added via Git LFS.
model-00001-of-00016.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:ad46ac358a4008f04a1a30cc7aaa380ea9bb768c8a95fb056a3d0adbbdaa1fc0
 size 4991703896
model-00002-of-00016.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:9af70387993676a39ded151d93765ea1207b21ea370df24a9e37b5bfdb115cc1
 size 5000002488
model-00015-of-00016.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:2502c41990fddac43223d09f378918252fa4e7fce71120e9effbd9eb7e38e84a
 size 4987365648
model-00016-of-00016.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:b55e94be9b6e05036a5405bf15e7dcdb05ba498272d7d8fe9074dfb40e22fd84
 size 1360124632
modeling_bailing_talker.py
CHANGED
@@ -141,9 +141,10 @@ class BailingTalkerForConditionalGeneration(PreTrainedModel):
             logits=logits,
         )
 
-    def sample(self, logits, topk=20, filter_value=-float("Inf")):
+    def sample(self, logits, topk=20, filter_value=-float("Inf"), stopping_criteria=False, eos_id=151666):
         logits = logits.reshape(1, -1)  # [1, V]
         indices_to_remove = logits < torch.topk(logits, topk)[0][..., -1, None]
+        indices_to_remove[0][eos_id] = True if stopping_criteria is True else indices_to_remove[0][eos_id]
         logits[indices_to_remove] = filter_value
         token_id = torch.multinomial(torch.softmax(logits, dim=-1), num_samples=1).to(torch.long)
         return token_id
@@ -161,23 +162,33 @@ class BailingTalkerForConditionalGeneration(PreTrainedModel):
         thinker_length,
         vp_emb=None,
         thinker_reply_part=None,
+        prompt_text=None,
+        prompt_speech_token=None,
     ):
 
         text_input_part = self.tokenizer.encode(tts_text)
+
+        prompt_text_input_part = self.tokenizer.encode(prompt_text)
+        prompt_speech_token = prompt_speech_token[0].tolist()
+        prompt_speech_token_bpe = self.s3bpe_tokenizer.encode(prompt_speech_token)[0]
+        prompt_speech_token_bpe = (torch.tensor(prompt_speech_token_bpe) + len(self.tokenizer)).tolist()
+
         # audio_prefix and text_prefix for first step generation
         talker_text_prefix = (
-
-
-
-
+            prompt +
+            prefix_from_thinker +
+            vp +
+            prompt_text_input_part[:1]
         )
         # the rest of input_text
         talker_text_input_part = (
-
+            prompt_text_input_part[1:] +
+            text_input_part +
             self.tokenizer.encode("<text_eos>") +
             self.tokenizer.encode("<text_pad>")
         )
 
+
         talker_text_prefix = torch.tensor(talker_text_prefix).reshape(1, -1).to(self.device)
 
 
@@ -192,6 +203,7 @@ class BailingTalkerForConditionalGeneration(PreTrainedModel):
             thinker_reply_part=thinker_reply_part,
             thinker_reply_length=torch.tensor([thinker_length]).to(self.device),
             thinker_prefix_insert_loc=torch.tensor([len(prompt) + 1]).to(self.device) if thinker_reply_part is not None else None,
+            prompt_wav_token=prompt_speech_token_bpe,
         )
 
         audio_token = [ele - len(self.tokenizer) for ele in audio_token]
@@ -257,6 +269,9 @@ class BailingTalkerForConditionalGeneration(PreTrainedModel):
         vp_emb=None,
         thinker_reply_part=None,
         max_length=50,
+        prompt_text=None,
+        prompt_speech_token=None,
+        **kwargs,
     ):
 
         # thinker_reply_part: [B, T, d]
@@ -302,6 +317,8 @@ class BailingTalkerForConditionalGeneration(PreTrainedModel):
                 thinker_length=thinker_length,
                 vp_emb=vp_emb,
                 thinker_reply_part=thinker_reply_part,
+                prompt_text=prompt_text,
+                prompt_speech_token=prompt_speech_token,
             )
             audio_tokens.append(audio_tokens_piece)
         return audio_tokens
@@ -319,10 +336,13 @@ class BailingTalkerForConditionalGeneration(PreTrainedModel):
         thinker_reply_part: Optional[torch.FloatTensor] = None,
         thinker_reply_length: Optional[torch.FloatTensor] = None,
         thinker_prefix_insert_loc: Optional[torch.LongTensor] = None,
+        prompt_wav_token: List = [],
+        min_new_token = 10,
     ):
         result = []
         step = 0
         eos_id = self.tokenizer.encode("<audio_eos>")[0]
+        prompt_wav_token_len = len(prompt_wav_token)
         while step < 1000:
             if step == 0:
                 talker_audio_input_ids = talker_audio_prefix
@@ -356,10 +376,16 @@ class BailingTalkerForConditionalGeneration(PreTrainedModel):
             # sampling
             logits = outputs.logits[:, -1, :]
 
-
+            stopping_criteria = position_ids.item() < prompt_wav_token_len + min_new_token
+            next_token = self.sample(logits, stopping_criteria=stopping_criteria)
             if next_token.item() == eos_id:
                 break
-
+
+            if len(prompt_wav_token) > 0:
+                next_token = torch.tensor([[prompt_wav_token[0]]]).to(logits.device)
+                prompt_wav_token = prompt_wav_token[1:]
+            else:
+                result.append(next_token.item())
             step += 1
 
         return result
@@ -374,13 +400,14 @@ class AudioDetokenizer:
         self.model.load(flow_model_path, hifigan_model_path)
         self.sr = 22050
 
-    def token2wav(self, audio_tokens,
+    def token2wav(self, audio_tokens, save_path=None, **kwargs):
         assert isinstance(audio_tokens, list), f"audio_tokens should be list"
         speech_list = []
         for audio_token in audio_tokens:
-            model_input = {"tts_speech_token": audio_token
-
-
+            model_input = {"tts_speech_token": audio_token}
+            kwargs.update(**model_input)
+
+            model_output = self.model.inference(**kwargs)
 
             silent_dur = 0.02
             silent_tensor = torch.Tensor([0.0] * int(self.sr * silent_dur))
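The decoding loop above now consumes a prompt speech-token sequence before emitting new tokens: while `prompt_wav_token` is non-empty the sampled token is overridden by the next prompt token, and `<audio_eos>` is masked out until `min_new_token` new tokens have been produced. A stripped-down, hedged illustration of that forced-prefix pattern, outside of this class and with made-up helper names, is:

```python
# Illustrative only: the "forced prefix" decoding pattern used above, in isolation.
def decode_with_forced_prefix(sample_next, prompt_tokens, min_new_tokens=10,
                              max_steps=1000, eos_id=151666):
    """sample_next(step, forbid_eos) -> int is a stand-in for the model's sampling call."""
    result, step = [], 0
    prompt_tokens = list(prompt_tokens)
    prompt_len = len(prompt_tokens)
    while step < max_steps:
        forbid_eos = step < prompt_len + min_new_tokens   # mirrors stopping_criteria in sample()
        token = sample_next(step, forbid_eos)
        if token == eos_id and not forbid_eos:
            break
        if prompt_tokens:                                  # teacher-force the prompt audio tokens first
            token = prompt_tokens.pop(0)
        else:
            result.append(token)                           # only freely generated tokens are kept
        step += 1
    return result
```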
|
modeling_bailingmm.py
CHANGED
@@ -5,6 +5,7 @@
 import copy
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Union
+import os
 
 import numpy as np
 import torch
@@ -495,9 +496,18 @@ class BailingMMNativeForConditionalGeneration(PreTrainedModel):
         from diffusion.sana_loss import SANALoss
         import os
         from safetensors.torch import load_file
-
-
-
+        if os.path.exists(inference_model_path):
+            temp_state_dict = load_file(os.path.join(inference_model_path, 'mlp', 'model.safetensors'))
+        else:
+            from huggingface_hub import hf_hub_download
+            from safetensors import safe_open
+            safetensors_path = hf_hub_download(
+                repo_id=inference_model_path,
+                filename="model.safetensors",
+                subfolder="mlp"
+            )
+            with safe_open(safetensors_path, framework="pt") as f:
+                temp_state_dict = {key: f.get_tensor(key) for key in f.keys()}
         self.query_tokens_dict = nn.ParameterDict()
         self.img_gen_scales = [4, 8, 16]
         for scale in self.img_gen_scales:
@@ -535,7 +545,7 @@ class BailingMMNativeForConditionalGeneration(PreTrainedModel):
         self.diffusion_loss.to(self.model.device)
         #self.norm_query_embeds = True
         # load connector
-        self.connector = AutoModelForCausalLM.from_pretrained(
+        self.connector = AutoModelForCausalLM.from_pretrained(inference_model_path, subfolder='connector')
         for layer in self.connector.model.layers:
             layer.self_attn.is_causal = False
         self.connector.to(self.model.device)
@@ -555,4 +565,19 @@ class BailingMMNativeForConditionalGeneration(PreTrainedModel):
         self.proj_out.load_state_dict(modified_state_dict_out, strict=True)
         self.proj_in.to(self.model.device)
         self.proj_out.to(self.model.device)
-        self.loaded_image_gen_modules = True
+        self.loaded_image_gen_modules = True
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
+        *model_args,
+        **kwargs,
+    ):
+        model = super().from_pretrained(
+            pretrained_model_name_or_path,
+            *model_args,
+            **kwargs,
+        )
+        model.load_image_gen_modules(pretrained_model_name_or_path)
+        return model
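One practical consequence of the new `from_pretrained` override and the `hf_hub_download` fallback above is that the image-generation weights load whether the model id is a local folder or a Hub repo id. A hedged sketch of that local-or-Hub loading pattern in isolation (the helper name and repo id below are examples, not repository code):

```python
# Hedged sketch of the local-path-or-Hub fallback used in load_image_gen_modules.
import os
from safetensors.torch import load_file

def load_mlp_state_dict(model_path: str):
    if os.path.exists(model_path):
        # Local checkout: read the safetensors file directly.
        return load_file(os.path.join(model_path, "mlp", "model.safetensors"))
    # Hub repo id: download just the one shard that is needed.
    from huggingface_hub import hf_hub_download
    from safetensors import safe_open
    safetensors_path = hf_hub_download(repo_id=model_path, filename="model.safetensors", subfolder="mlp")
    with safe_open(safetensors_path, framework="pt") as f:
        return {key: f.get_tensor(key) for key in f.keys()}

state_dict = load_mlp_state_dict("inclusionAI/Ming-Lite-Omni")
```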
requirements.txt
ADDED
@@ -0,0 +1,20 @@
+torch==2.1.2
+torchvision==0.16.2
+torchaudio==2.1.2
+funasr==1.1.14
+flash-attn==2.3.6
+peft==0.13.2
+diffusers #==0.33
+tokenizers==0.20.3
+transformers==4.45.0
+grouped_gemm==0.1.4
+decord==0.6.0
+hyperpyyaml
+modelscope
+onnxruntime
+inflect
+conformer
+diffusers
+lightning
+gdown
+openai-whisper==20240930
test_audio_tasks.py
CHANGED
@@ -94,9 +94,9 @@ class BailingMMInfer:
 
         if self.model.talker is not None and output_audio:
             thinker_reply_part = outputs.hidden_states[0][0] + outputs.hidden_states[0][-1]
-
-            audio_tokens = self.model.talker.omni_audio_generation(output_text,
-            waveform = self.audio_detokenizer.token2wav(audio_tokens,
+            spk_input = self.spk_info.get(speaker, 'luna')
+            audio_tokens = self.model.talker.omni_audio_generation(output_text, thinker_reply_part=thinker_reply_part, **spk_input)
+            waveform = self.audio_detokenizer.token2wav(audio_tokens, save_path=output_audio_path, **spk_input)
             return output_text, waveform
         return output_text
 