LandyGuo committed
Commit dd5f831 · 1 Parent(s): 81a8221

update v20250516 ckpts

.gitattributes CHANGED
@@ -38,5 +38,8 @@ data/wavs/BAC009S0915W0292.wav filter=lfs diff=lfs merge=lfs -text
  out.wav filter=lfs diff=lfs merge=lfs -text
  talker/tokenizer.json filter=lfs diff=lfs merge=lfs -text
  data/openai_whisper-20240930-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
+ data/matcha_tts-0.0.5.1-cp310-cp310-linux_x86_64.whl filter=lfs diff=lfs merge=lfs -text
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,19 +1,179 @@
- ---
- license: mit
- base_model:
- - inclusionAI/Ling-lite
- ---
- # Ming-Lite-Omni-Preview
- ### Model Description
-
- Ming-Lite-Omni-Preview employs a unified Mixture-of-Experts (MoE) framework for multimodal sequence modeling, which empowers [Ling](https://github.com/inclusionAI/Ling) LLMs to acquire comprehensive cross-modal understanding and generation capabilities. Specifically, Ming-Lite-Omni-Preview can process arbitrary combinations of audio, video, image, and text modalities as input, generating multimodal sequences interleaving with audio, image, or text outputs, thereby enabling an advanced and interactive realtime experience. To naturely handle the diverse modalities, we have enhanced Ling-Lite-MoE by incorporating modality-specific routers for each modality. As a result, Ming-Lite-Omni-Preview excels at handling information from diverse modalities and is highly scalable.
- ### Key Features
-
- - **Omni and Novel MoE Architecture**: An innovative Omni architecture based on Mixture of Experts (MoE) that achieves competive performance across multiple modality benchmarks.
-
- - **Video understanding**: Supports KV-Cache dynamic compression of visual tokens. While supporting the ability to understand long videos of hours, it can also provide more detailed understanding of short videos of a few seconds.
-
- - **Natural Speech Generation and Fine-grained Voice Dialogue**: Supports dialect understanding and generation in end-to-end conversations, enables one-shot voice cloning, and enhances prosody through audio tokenizer compression
  ## Model Downloads
@@ -22,24 +182,41 @@ You can download the model from both Huggingface and ModelScope.
 
  <div align="center">
 
- | **Model** | **Input modality** | **Oput modality** | **Download** |
- |:----------|:------------------:|:-----------------:|:------------:|
- | Ming-Lite-Omni-Preview | Image,text,viedio,audio | Image,text,audio | [🤗 HuggingFace](https://huggingface.co/inclusionAI/Ming-Lite-Omni-Preview) <br>[🤖 ModelScope](https://www.modelscope.cn/models/inclusionAI/Ming-Lite-Omni-Preview) |
  </div>
 
- ## Quickstart
 
- Please download our model following [Model Downloads](#model-downloads), then you can refer to the following codes to run Ming-Lite-Omni-Preview model.
 
  ```python
  import os
- from transformers import AutoProcessor
  from modeling_bailingmm import BailingMMNativeForConditionalGeneration
 
  # build model
  model = BailingMMNativeForConditionalGeneration.from_pretrained(
- "inclusionAI/Ming-Lite-Omni-Preview",
  torch_dtype=torch.bfloat16,
  low_cpu_mem_usage=True
  ).to("cuda")
@@ -47,7 +224,7 @@ model = BailingMMNativeForConditionalGeneration.from_pretrained(
  assets_path = YOUR_ASSETS_PATH
 
  # build processor
- processor = AutoProcessor.from_pretrained("inclusionAI/Ming-Lite-Omni-Preview", trust_remote_code=True)
  ```
 
  ```python
@@ -166,11 +343,13 @@ for k in inputs.keys():
  inputs[k] = inputs[k].to(dtype=torch.bfloat16)
 
  # call generate
  generated_ids = model.generate(
  **inputs,
  max_new_tokens=512,
- use_cache=False,
  eos_token_id=processor.gen_terminator,
  )
  generated_ids_trimmed = [
  out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
@@ -181,6 +360,7 @@ output_text = processor.batch_decode(
  print(output_text)
  ```
 
  ```python
  # ASR
@@ -193,8 +373,25 @@ messages = [
  ],
  },
  ]
- outputs = model.generate(messages, max_new_tokens=512)
- print(outputs)
  ```
 
  ```python
@@ -203,126 +400,130 @@ messages = [
203
  {
204
  "role": "HUMAN",
205
  "content": [
206
- {"type": "audio", "audio": 'data/wavs/BAC009S0915W0292.wav'},
207
  ],
208
  },
209
  ]
210
- outputs = model.generate(messages, max_new_tokens=512, speaker='luna', output_audio_path='out.wav', output_audio=True)
211
- print(outputs)
212
- ```
213
-
214
-
215
-
216
-
217
- ## Evaluation
218
-
219
- ### Image benchmark
220
- <div align="center">
221
-
222
- | Benchmarks | Ming-Lite-Omni-Preview | Qwen2.5-VL-7B-Instruct | InternVL2.5-8B-MPO |
223
- |:------------------|:----------------------:|:---------------------------:|:------------------:|
224
- | AI2D | 83.84 | 83.9 | <b>84.5</b> |
225
- | HallusionBench | <b>54.68</b> | 51.9 | 51.7 |
226
- | MMBench_TEST_V11 | 79.63 | <b>84.3</b> | 82.0 |
227
- | MMMU | 57.0 | <b>58.6</b> | 54.8 |
228
- | MMStar | 62.0 | 63.9 | <b>65.2</b> |
229
- | MMVet | <b>73.6</b> | 67.1 | 68.1 |
230
- | MathVista | <b>69.0</b> | 68.2 | 67.9 |
231
- | OCRBench | 87.9 | 86.4 | <b>88.2</b> |
232
- | Average | <b>70.96</b> | 70.5 | 70.3 |
233
-
234
- </div>
235
-
236
-
237
- #### Object Recognition
238
- <div align="center">
239
-
240
- | Object Recognition | Ming-Lite-Omni-Preview | Qwen2.5-VL-7B | InternVL-2.5-8B |
241
- |:----------------------------|:----------------------:|:-------------:|:---------------:|
242
- | Plants | 52.1 | <b>55.3</b> | 32.8 |
243
- | Animals | 52.6 | <b>54.8</b> | 36.5 |
244
- | Home appliances & furniture | 93.5 | <b>97.4</b> | 90.9 |
245
- | Personal Electronics | <b>96.1</b> | 95.1 | 93.2 |
246
- | Food & Ingredients | 57.5 | <b>60.0</b> | 48.7 |
247
- | Tableware | <b>96.6 | 94.9 | 88.1 |
248
- | Vehicles | 31.9 | <b>40.9</b> | 31.9 |
249
- | Average | 68.6 | <b>71.2</b> | 60.3 |
250
-
251
- </div>
252
-
253
-
254
- ### Video benchmark
255
-
256
- <div align="center">
257
-
258
- | Benchmarks | Ming-Lite-Omni-Preview | Qwen2.5VL-7B |
259
- |:-------------------|:------------------------:|:----------------:|
260
- | VideoMME wo/w sub. | 63.9/67.6 | <b>65.1/71.6</b> |
261
- | MVBench | 67.0 | <b>72.0</b> |
262
- | Video-MMMU | 45.4 | <b>47.44</b> |
263
- | LongVideoBench | 53.7 | <b>60.0</b> |
264
- </div>
265
-
266
 
267
- ### Audio benchmark
268
- #### SpeechQA
269
 
270
- <div align="center">
 
 
 
271
 
272
- | Model | AlpacaEval | CommonEval | SD-QA | MMSU | OpenBookQA | IFEval | AdvBench |
273
- |:-------------------------|:-----------:|:-----------:|:------------:|:------------:|:------------:|:------------:|:-------------:|
274
- | Qwen2-Audio-chat | 3.69 | 3.40 | 35.35 | 35.43 | 49.01 | 22.57 | 98.85 |
275
- | Baichuan-Audio | 4.00 | 3.39 | 49.64 | 48.80 | 63.30 | 41.32 | 86.73 |
276
- | GLM-4-Voice | 4.06 | 3.48 | 43.31 | 40.11 | 52.97 | 24.91 | 88.08 |
277
- | Kimi-Audio | 4.46 | <b>3.97</b> | <b>63.12</b> | 62.17 | <b>83.52</b> | <b>61.10</b> | <b>100.00</b> |
278
- | Qwen2.5-Omni | <b>4.49</b> | 3.93 | 55.71 | <b>61.32</b> | 81.10 | 52.87 | 99.42 |
279
- | Ming-Lite-Omni-Preview | 4.25 | 3.88 | 58.95 | 46.06 | 60.00 | 46.71 | 96.53 |
280
- </div>
281
 
282
- #### ASR
 
283
 
284
- <div align="center">
285
 
286
- | **Model** | **Aishell-1** | **Aishell-2 ios** | **Wenetspeech test-net** | **Wenet test-meeting** | **Librispeech test-clean** | **Librispeech test-other** |
287
- |:------------------------|:-------------:|:-----------------:|:------------------------:|:----------------------:|:--------------------------:|:--------------------------:|
288
- | Whisper Large-v3 | 5.14 | 4.76 | 9.68 | 18.54 | 1.9 | 3.65 |
289
- | Qwen2-Audio | 1.53 | 3.06 | 7.72 | 8.4 | <b>1.6</b> | 3.6 |
290
- | GLM-4-voice Base | 2.46 | - | - | - | 2.82 | 7.66 |
291
- | Baichuan-Omni-1.5 | - | - | 6.9 | 8.4 | - | - |
292
- | Qwen2.5-Omni | <b>1.18</b> | <b>2.36</b> | <b>5.9</b> | 7.7 | 1.8 | <b>3.4</b> |
293
- | Ming-Lite-Omni-Preview | 1.62 | 2.82 | 6.23 | <b>6.9</b> | 2.34 | 5.74 |
294
 
295
- </div>
296
 
 
297
 
298
- ### Knowledge
299
- <div align="center">
300
 
301
- | Model | InfoSeek_H-mean | InfoSeek_unseen_question | InfoSeek_unseen_entity |
302
- |:--------------------------|:---------------:|:------------------------:|:----------------------:|
303
- | GPT-4o | <b>36.05</b> | - | - |
304
- | PaLI-X | 22.06 | 23.5 | 20.8 |
305
- | Qwen2.5-vl-32B | 19.35 | 20.55 | 18.28 |
306
- | Ming-Lite-Omni-Preview | 27.3 | 28.9 | 25.9 |
307
- </div>
308
310
 
311
- ### OCR&GUI
312
- <div align="center">
313
 
314
- | Model | Ming-Lite-Omni-Preview | Qwen2.5-VL-7B-Instruct |
315
- |:-------------------|:----------------------:|:----------------------:|
316
- | ChartQA_TEST | 85.2 | <b>87.3</b> |
317
- | DocVQA_TEST | 93.2 | <b>95.7</b> |
318
- | OCRBenchV2_en/zh | 52.2/51.6 | <b>56.3/57.2</b> |
319
- | OmniDocBench↓ | 34.7/34.5 | <b>30.8/39.8</b> |
320
- | TextVQA_VAL | 82.36 | <b>84.9</b> |
321
- | ScreenSpot | 79.3 | <b>84.7</b> |
322
- </div>
323
 
 
324
 
325
 
326
- ## Model Sources
327
- - **Github Repository:** https://github.com/inclusionAI/Ming
328
 
 
1
+ # Ming-Lite-Omni
 
 
 
 
 
 
2
 
3
+ <p align="center">
4
+ <img src="./figures/ant-bailing.png" width="100"/>
5
+ <p>
6
 
7
+ <p align="center">📑 <a href="https://github.com/inclusionAI/Ming">Technical Report</a>|📖<a href="https://lucaria-academy.github.io/Ming-Omni/">Project Page</a> |🤗 <a href="https://huggingface.co/inclusionAI/Ming-Lite-Omni">Hugging Face</a>| 🤖 <a href="https://www.modelscope.cn/models/inclusionAI/Ming-Lite-Omni">ModelScope</a>|
8
 
 
9
 
10
+
11
+ ## Introduction
12
+
13
+ Ming-lite-omni is a light version of Ming-omni, derived from [Ling-lite](https://github.com/inclusionAI/Ling) and featuring 2.8 billion activated parameters. It is a unified multimodal model capable of processing images, text, audio, and video, while demonstrating strong proficiency in both speech and image generation. Ming-lite-omni employs dedicated encoders to extract tokens from different modalities, which are then processed by Ling, an MoE architecture equipped with newly proposed modality-specific routers. This design enables a single model to efficiently process and fuse multimodal inputs within a unified framework, thereby facilitating diverse tasks without requiring separate models, task-specific fine-tuning, or structural redesign. Importantly, Ming-lite-omni extends beyond conventional multimodal models by supporting audio and image generation. This is achieved through the integration of an advanced audio decoder for natural-sounding speech and Ming-Lite-Uni for high-quality image generation, which also allows the model to engage in context-aware chatting, perform text-to-speech conversion, and conduct versatile image editing. Our experimental results show that Ming-lite-omni offers a powerful solution for unified perception and generation across all modalities.
14
+ Notably, Ming-lite-omni is the first open-source model we are aware of to match GPT-4o in modality support, and we release all code and model weights to encourage further research and development in the community.
15
+
16
+
17
+ <p align="center">
18
+ <img src="./figures/ming.png" width="800"/>
19
+ <p>
20
+
21
+ ## 📌 Updates
22
+
23
+ [//]: # (* [2025.05.28] 🔥 Our [Technical Report]&#40;https://arxiv.org/pdf/2505.02471&#41; is in public on arxiv.)
24
+ * [2025.05.28] 🔥 The official version of Ming-lite-omni is released, with better performance and image generation support.
25
+ * [2025.05.04] 🔥 We released the test version of Ming-lite-omni: [Ming-lite-omni-Preview](https://github.com/inclusionAI/Ming/tree/Ming-Lite-Omni-Preview).
26
+
27
+
28
+ ## Key Features
29
+
30
+ - **Unified Omni-Modality Perception**: Ming-lite-omni is built on [Ling](https://github.com/inclusionAI/Ling), an MoE-architecture LLM, and resolves task conflicts while ensuring coherent integration of tokens from different modalities through modality-specific routers (see the illustrative sketch after this list).
31
+
32
+ - **Unified Perception and Generation**: Ming-lite-omni achieves unified understanding and generation, enabling the model to interpret multimodal instructions and user intent during generation, which helps enhance generation quality and improves usability across multiple tasks.
33
+
34
+ - **Innovative Generation Capabilities**: Ming-lite-omni can perceive all modalities and generate high-quality text, real-time speech, and vivid images simultaneously, delivering exceptional cross-modal performance across diverse tasks including image perception, audio-visual interaction, and image generation.
35
+
36
+
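+ The modality-specific routing mentioned above can be pictured with a minimal sketch. This is an illustrative toy layer only (the class name, sizes, and modality tags are hypothetical and do not reflect the actual Ling/Ming-lite-omni implementation): each modality owns its own gate over a shared pool of experts, so routing decisions never compete across modalities.
+
+ ```python
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class ModalitySpecificRouter(nn.Module):
+     """Toy MoE layer: every modality gets its own router over a shared expert pool."""
+     def __init__(self, d_model=64, n_experts=4, modalities=("text", "image", "audio", "video"), top_k=2):
+         super().__init__()
+         self.experts = nn.ModuleList(nn.Linear(d_model, d_model) for _ in range(n_experts))
+         # one lightweight gating network per modality
+         self.routers = nn.ModuleDict({m: nn.Linear(d_model, n_experts) for m in modalities})
+         self.top_k = top_k
+
+     def forward(self, x, modality):
+         # x: [batch, seq, d_model]; all tokens in x belong to one modality
+         gate = F.softmax(self.routers[modality](x), dim=-1)      # [B, S, n_experts]
+         weights, idx = gate.topk(self.top_k, dim=-1)             # send each token to its top-k experts
+         weights = weights / weights.sum(dim=-1, keepdim=True)
+         out = torch.zeros_like(x)
+         for k in range(self.top_k):
+             for e, expert in enumerate(self.experts):
+                 mask = (idx[..., k] == e).unsqueeze(-1)          # tokens routed to expert e in slot k
+                 out = out + mask * weights[..., k:k+1] * expert(x)
+         return out
+
+ layer = ModalitySpecificRouter()
+ audio_tokens = torch.randn(1, 8, 64)
+ print(layer(audio_tokens, modality="audio").shape)  # torch.Size([1, 8, 64])
+ ```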
37
+ ## Evaluation
38
+ Ming-lite-omni delivers exceptional cross-modal performance, as validated across image perception, audio-visual interaction, and image generation tasks. Specifically, in image perception, Ming-lite-omni attains performance comparable to Qwen2.5-VL-7B while activating only 2.8B parameters. It delivers superior performance in end-to-end speech understanding and instruction following, surpassing Qwen2.5-Omni and Kimi-Audio. It also supports native-resolution image generation, editing, and style transfer, achieving a GenEval score of 0.64 and outperforming mainstream models such as SDXL. In terms of FID, Ming-lite-omni reaches 4.85, setting a new SOTA among existing methods.
39
+ <p align="center">
40
+ <img src="./figures/performance.png" width="800"/>
41
+ <p>
42
+
43
+
44
+ ### Image benchmark
45
+ <div align="center">
46
+
47
+ | Benchmarks | Ming-lite-omni | Qwen2.5-VL-7B-Instruct | InternVL2.5-8B-MPO |
48
+ |:------------------|:--------------:|:----------------------------:|:------------------:|
49
+ | AI2D | 83.1 | 84.4 | <b>84.5</b> |
50
+ | HallusionBench | <b>55.0</b> | 55.8 | 51.7 |
51
+ | MMBench_TEST_V11 | 80.8 | <b>82.8</b> | 82.0 |
52
+ | MMMU | 56.3 | <b>56.6</b> | 54.8 |
53
+ | MMStar | 64.7 | 65.3 | <b>65.2</b> |
54
+ | MMVet | 71.3 | 71.6 | 68.1 |
55
+ | MathVista | <b>71.6</b> | 68.1 | 67.9 |
56
+ | OCRBench | <b>88.4</b> | 87.8 | 88.2 |
57
+ | Average | 71.4 | <b>71.5</b> | 70.3 |
58
+
59
+ </div>
60
+
61
+
62
+ #### Encyclopedia Benchmarks
63
+ <div align="center">
64
+
65
+ | Object Recognition | Ming-lite-omni | Qwen2.5-VL-7B-Instruct |
66
+ |:---------------------|:--------------:|:------------------------:|
67
+ | Plants | **54.96** | 47.8 |
68
+ | Animals | **56.7** | 50.85 |
69
+ | Vehicles | 41.91 | **42.29** |
70
+ | Food & Ingredients | **62.28** | 54.09 |
71
+ | Dishes | **44.3** | 39.07 |
72
+ | General | 91.08 | **92.42** |
73
+ | Average | **58.54** | 54.43 |
74
+
75
+ </div>
76
+
77
+ ### Video benchmark
78
+
79
+ <div align="center">
80
+
81
+ | Benchmarks | Ming-lite-omni | Qwen2.5-VL-7B-Instruct |
82
+ |:------------------------|:--------------:|:---------------------:|
83
+ | VideoMME | 67.0 | <b>67.3</b> |
84
+ | MVBench | 67.7 | <b>67.4</b> |
85
+ | Video-MMMU | 46.3 | <b>47.4</b> |
86
+ | LongVideoBench | 56.6 | 54.7 |
87
+ | Average | <b>59.4</b> | 59.2 |
88
+
89
+ </div>
90
+ Note: All models are evaluated based on 128 uniformly sampled frames.
91
+
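+ For reference, "128 uniformly sampled frames" can be reproduced in a few lines. This is a simplified linspace-based sketch (the repository's own `sample_frames` helper in `bailingmm_utils.py` uses per-segment midpoints, which yields nearly identical indices):
+
+ ```python
+ import numpy as np
+
+ def uniform_frame_indices(total_frames: int, num_frames: int = 128) -> list:
+     # evenly spaced frame indices over the whole clip, clipped to the valid range
+     idx = np.linspace(0, total_frames - 1, num=min(num_frames, total_frames))
+     return np.round(idx).astype(int).tolist()
+
+ print(uniform_frame_indices(total_frames=3000)[:5])  # [0, 24, 47, 71, 94]
+ ```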
92
+ ### Audio benchmark
93
+ #### SpeechQA
94
+
95
+ <div align="center">
96
+
97
+ | Model | Average | AlpacaEval | CommonEval | SD-QA | MMSU | OpenBookQA | IFEval | AdvBench |
98
+ |:-----------------|:-------------:|:-----------:|:-----------:|:------------:|:------------:|:------------:|:------------:|:-------------:|
99
+ | Qwen2-Audio-chat | 3.545 | 3.69 | 3.40 | 35.35 | 35.43 | 49.01 | 22.57 | 98.85 |
100
+ | Baichuan-Audio | 3.695 | 4.00 | 3.39 | 49.64 | 48.80 | 63.30 | 41.32 | 86.73 |
101
+ | GLM-4-Voice | 3.77 | 4.06 | 3.48 | 43.31 | 40.11 | 52.97 | 24.91 | 88.08 |
102
+ | Kimi-Audio | 4.215 | 4.46 | 3.97 | <b>63.12</b> | <b>62.17</b> | <b>83.52</b> | <b>61.10</b> | <b>100.00</b> |
103
+ | Qwen2.5-Omni | 4.21 | 4.49 | 3.93 | 55.71 | 61.32 | 81.10 | 52.87 | 99.42 |
104
+ | Ming-lite-omni | <b>4.34</b> | <b>4.63</b> | <b>4.06</b> | 58.84 | 47.53 | 61.98 | 58.36 | 99.04 |
105
+ </div>
106
+
107
+ #### ASR
108
+
109
+ <div align="center">
110
+
111
+ | Model | aishell1 | aishell2_android | aishell2_ios | cv15_zh | fleurs_zh | wenetspeech_meeting | wenetspeech_net | librispeech_test_clean | librispeech_test_other | multilingual_librispeech | cv15_en | fleurs_en | voxpopuli_v1.0_en |
112
+ |:--------------:|:--------:|:----------------:|:------------:|:--------:|:---------:|:-------------------:|:---------------:|:----------------------:|:----------------------:|:------------------------:|:--------:|:---------:|:--------------------:|
113
+ | Ming-lite-omni | 1.47 | **2.55** | **2.52** | 6.31 | 2.96 | 5.95 | 5.46 | 1.44 | 2.80 | **4.15** | **6.89** | **3.39** | **5.80** |
114
+ | Qwen2.5-Omni | 1.18 | 2.75 | 2.63 | **5.20** | 3.00 | **5.90** | 7.70 | 1.80 | 3.40 | 7.56 | 7.60 | 4.10 | **5.80** |
115
+ | Qwen2-Audio | 1.53 | 2.92 | 2.92 | 6.90 | 7.50 | 7.16 | 8.42 | 1.60 | 3.60 | 5.40 | 8.60 | 6.90 | 6.84 |
116
+ | Kimi-Audio | **0.60** | 2.64 | 2.56 | 7.21 | **2.69** | 6.28 | **5.37** | **1.28** | **2.42** | 5.88 | 10.31 | 4.44 | 7.97 |
117
+
118
+ </div>
119
+
120
+
121
+
122
+ ### Information-Seeking Benchmark
123
+ <div align="center">
124
+
125
+ | Model | InfoSeek_H-mean | InfoSeek_unseen_question | InfoSeek_unseen_entity |
126
+ |:---------------|:---------------:|:------------------------:|:----------------------:|
127
+ | GPT-4o | <b>36.05</b> | - | - |
128
+ | PaLI-X | 22.06 | 23.5 | 20.8 |
129
+ | Qwen2.5-vl-32B | 19.35 | 20.55 | 18.28 |
130
+ | Ming-lite-omni | 27.7 | **30.4** | **25.4** |
131
+ </div>
132
+
133
+
134
+
135
+ ### OCR
136
+ <div align="center">
137
+
138
+ | Model | Ming-lite-omni | Qwen2.5-VL-7B-Instruct |
139
+ |:-------------------|:--------------:|:-----------------------:|
140
+ | ChartQA_TEST | 85.1 | <b>87.3</b> |
141
+ | DocVQA_TEST | 93 | <b>95.7</b> |
142
+ | OCRBenchV2_en/zh | 53.3/52 | <b>56.3/57.2</b> |
143
+ | OmniDocBench↓ | 34/<b>34.4</b> | <b>30.8</b>/39.8 |
144
+ | TextVQA_VAL | 82.8 | <b>84.9</b> |
145
+ </div>
146
+
147
+ ### GUI
148
+ <div align="center">
149
+
150
+ | Model | Ming-lite-omni | InternVL3 8B | Qwen2.5-VL-7B-Instruct |
151
+ |:---------------------------|:--------------:|:------------:|:----------------------:|
152
+ | ScreenSpot | <b>82.1</b> | 79.5 | 78.9* |
153
+ | ScreenSpot-V2 | <b>84.1</b> | 81.4 | - |
154
+ | AITZ(EM) | <b>66.6</b> | - | 57.6* |
155
+ </div>
156
+ Note: * denotes the reproduced results.
157
+
158
+
159
+
160
+ ### Unified Generation Benchmark
161
+
162
+ <div align="center">
163
+
164
+ | Model | single_object | two_object | counting | colors | position | color_attr | GENEVAL | DPGBench | FID↓ |
165
+ |:---------------|:-------------:|:----------:|:----------:|:--------:|:--------:|:----------:|:--------:|:---------:|:-------------:|
166
+ | Ming-lite-omni | **0.9875** | **0.7727** | **0.6812** | 0.7872 | 0.31 | 0.29 | **0.64** | 81.72 | **4.85** |
167
+ | Metaquery-XL | - | - | - | - | - | - | 0.61 | **82.05** | 6.02 |
168
+ | SDv2.1 | 0.98 | 0.51 | 0.44 | **0.85** | 0.07 | 0.17 | 0.50 | 68.09 | 26.96 |
169
+ | Emu3-Gen | 0.98 | 0.71 | 0.34 | 0.81 | 0.17 | 0.21 | 0.54 | 80.60 | - |
170
+ | SDXL | 0.98 | 0.74 | 0.39 | **0.85** | 0.15 | 0.23 | 0.55 | 74.65 | 8.76 |
171
+ | Janus | 0.97 | 0.68 | 0.30 | 0.84 | **0.46** | **0.42** | 0.61 | 79.68 | 10.10 |
172
+ | JanusFlow | - | - | - | - | - | - | 0.63 | 80.09 | 9.51 |
173
+
174
+ </div>
175
+
176
+ Please refer to our technical report for more comprehensive evaluation results.
177
 
178
 
179
  ## Model Downloads
 
182
 
183
  <div align="center">
184
 
185
+ | **Model**      | **Input modality**        | **Output modality** | **Download** |
+ |:---------------|:-------------------------:|:-------------------:|:------------:|
+ | Ming-Lite-Omni | Image, text, video, audio | Image, text, audio  | [🤗 HuggingFace](https://huggingface.co/inclusionAI/Ming-Lite-Omni) <br>[🤖 ModelScope](https://www.modelscope.cn/models/inclusionAI/Ming-Lite-Omni) |
188
  </div>
189
+ If you're in mainland China, we strongly recommend downloading our model from 🤖 <a href="https://www.modelscope.cn/models/inclusionAI/Ming-Lite-Omni">ModelScope</a>.
190
+
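+ A programmatic download sketch (the local directory name is a placeholder; the ModelScope call writes to its default cache unless configured otherwise):
+
+ ```python
+ # From the Hugging Face Hub
+ from huggingface_hub import snapshot_download
+ snapshot_download(repo_id="inclusionAI/Ming-Lite-Omni", local_dir="Ming-Lite-Omni")
+
+ # From ModelScope (recommended in mainland China)
+ from modelscope import snapshot_download as ms_snapshot_download
+ ms_snapshot_download("inclusionAI/Ming-Lite-Omni")
+ ```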
191
+
192
+ ## Use Cases
193
+
194
+ Additional demonstration cases are available on our project [page](https://lucaria-academy.github.io/Ming-Omni/).
195
+
196
 
197
 
 
198
 
199
+ ## Example Usage
200
+
201
+ Please download our model following [Model Downloads](#model-downloads), then refer to the following code to run the Ming-lite-omni model.
202
+
203
+ Install the Python environment dependencies:
204
+ ```shell
205
+ pip install -r requirements.txt
206
+ pip install data/matcha_tts-0.0.5.1-cp310-cp310-linux_x86_64.whl
207
+ pip install diffusers==0.33.0
208
+ pip install nvidia-cublas-cu12==12.4.5.8 # for H20
209
+ ```
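+ The model below is loaded in bfloat16 and moved to `cuda`, so a quick sanity check of the environment may save a failed load (a minimal sketch):
+
+ ```python
+ import torch
+
+ print("torch:", torch.__version__)
+ print("CUDA available:", torch.cuda.is_available())
+ if torch.cuda.is_available():
+     print("device:", torch.cuda.get_device_name(0))
+     print("bf16 supported:", torch.cuda.is_bf16_supported())
+ ```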
210
 
211
  ```python
212
  import os
213
+ import torch
214
+ from transformers import AutoProcessor, GenerationConfig
215
  from modeling_bailingmm import BailingMMNativeForConditionalGeneration
216
 
217
  # build model
218
  model = BailingMMNativeForConditionalGeneration.from_pretrained(
219
+ "inclusionAI/Ming-Lite-Omni",
220
  torch_dtype=torch.bfloat16,
221
  low_cpu_mem_usage=True
222
  ).to("cuda")
 
224
  assets_path = YOUR_ASSETS_PATH
225
 
226
  # build processor
227
+ processor = AutoProcessor.from_pretrained("inclusionAI/Ming-Lite-Omni", trust_remote_code=True)
228
  ```
229
 
230
  ```python
 
343
  inputs[k] = inputs[k].to(dtype=torch.bfloat16)
344
 
345
  # call generate
346
+ generation_config = GenerationConfig.from_dict({'no_repeat_ngram_size': 10})
347
  generated_ids = model.generate(
348
  **inputs,
349
  max_new_tokens=512,
350
+ use_cache=True,
351
  eos_token_id=processor.gen_terminator,
352
+ generation_config=generation_config,
353
  )
354
  generated_ids_trimmed = [
355
  out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
 
360
  print(output_text)
361
  ```
362
 
363
+ ### Audio tasks
364
 
365
  ```python
366
  # ASR
 
373
  ],
374
  },
375
  ]
376
+ # The ASR task uses the whisper encoder, so the processor call above needs the extra arguments below
377
+ inputs = processor(
378
+ text=[text],
379
+ images=image_inputs,
380
+ videos=video_inputs,
381
+ audios=audio_inputs,
382
+ return_tensors="pt",
383
+ audio_kwargs={'use_whisper_encoder': True}
384
+ )
385
+
386
+ outputs = model.generate(
387
+ **inputs,
388
+ max_new_tokens=512,
389
+ use_cache=True,
390
+ eos_token_id=processor.gen_terminator,
391
+ generation_config=generation_config,
392
+ use_whisper_encoder=True
393
+ )
394
+
395
  ```
396
 
397
  ```python
 
400
  {
401
  "role": "HUMAN",
402
  "content": [
403
+ {"type": "audio", "audio": 'data/wavs/speechQA_sample.wav'},
404
  ],
405
  },
406
  ]
407
+ generation_config = GenerationConfig.from_dict({
408
+ 'output_hidden_states': True,
409
+ 'return_dict_in_generate': True,
410
+ 'no_repeat_ngram_size': 10}
411
+ )
 
412
 
413
+ outputs = model.generate(
414
+ **inputs,
415
+ max_new_tokens=512,
416
+ use_cache=True,
417
+ eos_token_id=processor.gen_terminator,
418
+ generation_config=generation_config,
419
+ use_whisper_encoder=False
420
+ )
421
 
422
+ generated_ids = outputs.sequences
423
+ generated_ids_trimmed = [
424
+ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
425
+ ]
426
 
427
+ # speechQA result
428
+ output_text = processor.batch_decode(
429
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
430
+ )[0]
 
 
 
 
 
431
 
432
+ # for TTS
433
+ from modeling_bailing_talker import AudioDetokenizer
434
 
435
+ model_name_or_path = model.config._name_or_path
436
+ audio_detokenizer = AudioDetokenizer(
437
+ f'{model_name_or_path}/talker/audio_detokenizer.yaml',
438
+ flow_model_path=f'{model_name_or_path}/talker/flow.pt',
439
+ hifigan_model_path=f'{model_name_or_path}/talker/hift.pt'
440
+ )
441
+ spk_input = torch.load('data/spks/luna.pt')
442
+ thinker_reply_part = outputs.hidden_states[0][0] + outputs.hidden_states[0][-1]
443
+ # Setting thinker_reply_part to None allows the talker to operate as a standalone TTS model, independent of the language model.
444
+ audio_tokens = model.talker.omni_audio_generation(
445
+ output_text,
446
+ thinker_reply_part=thinker_reply_part, **spk_input)
447
+ waveform = audio_detokenizer.token2wav(audio_tokens, save_path='out.wav', **spk_input)
448
 
449
+ ```
450
+ For detailed usage of the ASR, SpeechQA, and TTS tasks, please refer to `test_audio_tasks.py`.
 
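+ Passing `thinker_reply_part=None` (as noted in the TTS snippet above) lets the talker act as a standalone TTS module. A minimal sketch reusing the `audio_detokenizer` and `spk_input` objects built there:
+
+ ```python
+ # Standalone TTS: synthesize arbitrary text without conditioning on the LLM hidden states.
+ tts_text = "Hello, welcome to Ming-Lite-Omni."
+ audio_tokens = model.talker.omni_audio_generation(
+     tts_text, thinker_reply_part=None, **spk_input
+ )
+ waveform = audio_detokenizer.token2wav(audio_tokens, save_path='tts_only.wav', **spk_input)
+ ```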
451
 
452
+ ### Image Generation & Edit
453
 
454
+ Ming-omni natively supports image generation and image editing. To use this functionality, you only need to pass the corresponding parameters to the generate function.
455
 
456
+ ```python
457
+ # Image generation mode currently limits the range of input pixels.
458
+ gen_input_pixels = 451584
459
+ processor.max_pixels = gen_input_pixels
460
+ processor.min_pixels = gen_input_pixels
461
+
462
+ def generate(messages, processor, model, **image_gen_param):
463
+ text = processor.apply_chat_template(messages, add_generation_prompt=True)
464
+ image_inputs, video_inputs, audio_inputs = processor.process_vision_info(messages)
465
+
466
+ inputs = processor(
467
+ text=[text],
468
+ images=image_inputs,
469
+ videos=video_inputs,
470
+ audios=audio_inputs,
471
+ return_tensors="pt",
472
+ ).to(model.device)
473
+
474
+ for k in inputs.keys():
475
+ if k == "pixel_values" or k == "pixel_values_videos" or k == "audio_feats":
476
+ inputs[k] = inputs[k].to(dtype=torch.bfloat16)
477
+
478
+ print(image_gen_param)
479
+ image = model.generate(
480
+ **inputs,
481
+ image_gen=True,
482
+ **image_gen_param,
483
+ )
484
+ return image
485
 
486
+ ```
 
 
 
 
 
 
487
 
488
+ Text-to-image
489
+ ```python
490
+ messages = [
491
+ {
492
+ "role": "HUMAN",
493
+ "content": [
494
+ {"type": "text", "text": "Draw a girl with short hair."},
495
+ ],
496
+ }
497
+ ]
498
+ image = generate(
499
+ messages=messages, processor=processor, model=model,
500
+ image_gen_cfg=6.0, image_gen_steps=20, image_gen_width=480, image_gen_height=544
501
+ )
502
+ image.save("./t2i.jpg")
503
+ ```
504
 
505
+ Edit
506
+ ```python
507
+ messages = [
508
+ {
509
+ "role": "HUMAN",
510
+ "content": [
511
+ {"type": "image", "image": "samples/cake.jpg"},
512
+ {"type": "text", "text": "add a candle on top of the cake"},
513
+ ],
514
+ }
515
+ ]
516
+ image = generate(
517
+ messages=messages, processor=processor, model=model,
518
+ image_gen_cfg=6.0, image_gen_steps=20, image_gen_width=512, image_gen_height=512
519
+ )
520
+ image.save("./edit.jpg")
521
+ ```
522
 
 
 
523
 
524
+ ## License and Legal Disclaimer
 
 
 
 
 
 
 
 
525
 
526
+ This code repository is licensed under the [MIT License](../LICENSE), and the Legal Disclaimer is located in the [LEGAL.md file](../LEGAL.md) under the project's root directory.
527
 
528
 
 
 
529
 
audio_detokenizer/cli/model.py CHANGED
@@ -36,9 +36,9 @@ class AudioDetokenizerModel:
36
  self.hift.load_state_dict(torch.load(hift_model, map_location=self.device))
37
  self.hift.to(self.device).eval()
38
 
39
- def inference(self, flow_embedding, tts_speech_token,
40
- flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), flow_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
41
- prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32), is_en=False):
42
 
43
  torch.cuda.synchronize()
44
  t0 = time.time()
@@ -48,11 +48,11 @@ class AudioDetokenizerModel:
48
 
49
  tts_mel = self.flow.inference(token=tts_speech_token.to(self.device),
50
  token_len=torch.tensor([tts_speech_token.size(1)], dtype=torch.int32).to(self.device),
51
- prompt_token=flow_prompt_speech_token.to(self.device),
52
- prompt_token_len=flow_prompt_speech_token_len.to(self.device),
53
  prompt_feat=prompt_speech_feat.to(self.device),
54
  prompt_feat_len=prompt_speech_feat_len.to(self.device),
55
- embedding=flow_embedding.to(self.device).to(self.dtype)).float()
56
  torch.cuda.synchronize()
57
 
58
  tts_speech = self.hift.inference(mel=tts_mel).cpu()
 
36
  self.hift.load_state_dict(torch.load(hift_model, map_location=self.device))
37
  self.hift.to(self.device).eval()
38
 
39
+ def inference(self, vp_emb, tts_speech_token,
40
+ prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
41
+ prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32), is_en=False, **kwargs):
42
 
43
  torch.cuda.synchronize()
44
  t0 = time.time()
 
48
 
49
  tts_mel = self.flow.inference(token=tts_speech_token.to(self.device),
50
  token_len=torch.tensor([tts_speech_token.size(1)], dtype=torch.int32).to(self.device),
51
+ prompt_token=prompt_speech_token.to(self.device),
52
+ prompt_token_len=prompt_speech_token_len.to(self.device),
53
  prompt_feat=prompt_speech_feat.to(self.device),
54
  prompt_feat_len=prompt_speech_feat_len.to(self.device),
55
+ embedding=vp_emb.to(self.device).to(self.dtype)).float()
56
  torch.cuda.synchronize()
57
 
58
  tts_speech = self.hift.inference(mel=tts_mel).cpu()
bailingmm_utils.py CHANGED
@@ -170,7 +170,7 @@ def sample_frames(num_frames, total_frames, sample="random"):
170
  padded_frame_indices = [frame_indices[-1]] * num_frames
171
  padded_frame_indices[:len(frame_indices)] = frame_indices
172
  frame_indices = padded_frame_indices
173
- elif sample == "uniform":
174
  frame_indices = [(x[0] + x[1]) // 2 for x in ranges]
175
  if len(frame_indices) < num_frames:
176
  frame_indices = [
@@ -278,12 +278,75 @@ def _read_video_decord(
278
  frame_indices = sample_frames(
279
  num_frames=num_frames, total_frames=total_frames, sample=sample_method
280
  )
 
 
 
 
 
 
 
 
281
 
282
- video = vr.get_batch(frame_indices).asnumpy()
283
  video = torch.tensor(video).permute(0, 3, 1, 2) # Convert to TCHW format
284
  sample_fps = num_frames / max(total_frames, 1e-6) * video_fps
285
  return video, sample_fps
286
 
287
  VIDEO_READER_BACKENDS = {
288
  "decord": _read_video_decord,
289
  "torchvision": _read_video_torchvision,
@@ -410,6 +473,19 @@ def extract_vision_info(conversations: list[dict] | list[list[dict]]) -> list[di
410
  or ele["type"] in ("image", "image_url", "video")
411
  ):
412
  vision_infos.append(ele)
413
  return vision_infos
414
 
415
  def process_vision_info(
 
170
  padded_frame_indices = [frame_indices[-1]] * num_frames
171
  padded_frame_indices[:len(frame_indices)] = frame_indices
172
  frame_indices = padded_frame_indices
173
+ elif sample == "uniform" or sample == "adaptive":
174
  frame_indices = [(x[0] + x[1]) // 2 for x in ranges]
175
  if len(frame_indices) < num_frames:
176
  frame_indices = [
 
278
  frame_indices = sample_frames(
279
  num_frames=num_frames, total_frames=total_frames, sample=sample_method
280
  )
281
+ if sample_method == "adaptive" and len(frame_indices) > 64:
282
+ frames_indices_selected = select_frames_based_on_query(vr, frame_indices, ele)  # cross-modal frame selection result for the query
283
+ indices = np.linspace(0, len(frame_indices) - 1, len(frame_indices)//2, dtype=int)
284
+ frame_indices = np.array(frame_indices)[indices].tolist()
285
+ frames_indices_selected_sort = np.sort(frame_indices + frames_indices_selected[:(num_frames - len(frame_indices))].tolist()).tolist()
286
+ video = vr.get_batch(frames_indices_selected_sort).asnumpy()
287
+ else:
288
+ video = vr.get_batch(frame_indices).asnumpy()
289
 
290
+ # video = vr.get_batch(frame_indices).asnumpy()
291
  video = torch.tensor(video).permute(0, 3, 1, 2) # Convert to TCHW format
292
  sample_fps = num_frames / max(total_frames, 1e-6) * video_fps
293
  return video, sample_fps
294
 
295
+ def select_frames_based_on_query(vr, frame_indices, ele):
296
+ import sys
297
+ sys.path.join("./longvu")
298
+ '''
299
+ This LongVU model (https://github.com/Vision-CAIR/LongVU) computes cross-modal relevance
300
+ between user queries and video frames for the purpose of frame selection.
301
+ It can also be replaced with other text/visual encoders to achieve the same effect.
302
+ To maintain consistency in the repository structure, this module has not been included in the repository directory for now.
303
+ If needed for evaluation, simply import this module.
304
+ '''
305
+ from longvu.constants import (
306
+ DEFAULT_IMAGE_TOKEN,
307
+ IMAGE_TOKEN_INDEX,
308
+ )
309
+ from longvu.conversation import conv_templates, SeparatorStyle
310
+ from longvu.mm_datautils import (
311
+ KeywordsStoppingCriteria,
312
+ process_images,
313
+ tokenizer_image_token,
314
+ )
315
+ tokenizer, model, image_processor = ele["tokenizer"], ele["model"], ele["image_processor"]
316
+
317
+ # Consider expanding frame_indices here
318
+ video = vr.get_batch(frame_indices).asnumpy() # (21, 320, 568, 3)
319
+
320
+ image_sizes = [video[0].shape[:2]] # [(320, 568)]
321
+ video = process_images(video, image_processor, model.config)  # len(video)=2: first torch.Size([623, 3, 384, 384]), second torch.Size([623, 3, 378, 378])
+ video = [item.unsqueeze(0) for item in video]  # len(video)=2: first torch.Size([1, 623, 3, 384, 384]), second torch.Size([1, 623, 3, 378, 378])
323
+
324
+ qs = DEFAULT_IMAGE_TOKEN + "\n" + ele["text"]
325
+ conv = conv_templates["qwen"].copy()
326
+ conv.append_message(conv.roles[0], qs)
327
+ conv.append_message(conv.roles[1], None)
328
+ prompt = conv.get_prompt()
329
+
330
+ input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(model.device) # torch.Size([1, 26])
331
+ stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 # '<|im_end|>'
332
+ keywords = [stop_str]
333
+ stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
334
+
335
+ with torch.inference_mode():
336
+ output_ids = model.generate(
337
+ input_ids,
338
+ images=video,
339
+ image_sizes=image_sizes,
340
+ do_sample=False,
341
+ temperature=0.2,
342
+ max_new_tokens=128,
343
+ use_cache=True,
344
+ stopping_criteria=[stopping_criteria],
345
+ ) # torch.Size([1, 128])
346
+
347
+ selected_indices = np.array(frame_indices)[output_ids.cpu().numpy()]
348
+ return selected_indices
349
+
350
  VIDEO_READER_BACKENDS = {
351
  "decord": _read_video_decord,
352
  "torchvision": _read_video_torchvision,
 
473
  or ele["type"] in ("image", "image_url", "video")
474
  ):
475
  vision_infos.append(ele)
476
+ # Also attach the video's query_text here
477
+ if "text" in ele: text = ele["text"]
478
+ if "video" in ele and ele["sample"] == "adaptive":
479
+ tokenizer = ele["tokenizer"]
480
+ model = ele["model"]
481
+ image_processor = ele["image_processor"]
482
+ for ele in vision_infos:
483
+ if "video" in ele and ele["sample"] == "adaptive":
484
+ ele["text"] = text
485
+ ele["tokenizer"] = tokenizer
486
+ ele["model"] = model
487
+ ele["image_processor"] = image_processor
488
+ return vision_infos
489
  return vision_infos
490
 
491
  def process_vision_info(
data/matcha_tts-0.0.5.1-cp310-cp310-linux_x86_64.whl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e316b130dc66906d71bcb6bdc0d9c43c860901204cb64a4a3adb903dc1edfe90
3
+ size 576473
data/spks/luna.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f485f3180000a68aac70482d3e543429abe24866b79d8431630953c313f634c8
3
- size 1953
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22f9a6b446e0849d4233ac25d11ebf7e96b30746e16482890a6367c8a3b6e05f
3
+ size 32094
figures/ant-bailing.png ADDED

Git LFS Details

  • SHA256: 206b4b04b4bcc0b35e6fd394da1bc203cc55ce4facce8f2d39b322bd820e374a
  • Pointer size: 130 Bytes
  • Size of remote file: 58.7 kB
figures/cases/animal.png ADDED

Git LFS Details

  • SHA256: d51829d4a607bd95404851d27ee7ea0d87f9f75b5cba6c0016255363903c1abf
  • Pointer size: 130 Bytes
  • Size of remote file: 37.5 kB
figures/cases/audioqa_audio.wav ADDED
Binary file (64.8 kB). View file
 
figures/cases/audioqa_video.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74b65d9bec7f83e487b7f923076c01d476dd2ef7ed83928a696ab6f88c7751b7
3
+ size 776184
figures/cases/demo_430v2.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1cf17a700173885c60c3b5fb79a13618b5075e5fb8781f98f4c67118c3933b3
3
+ size 18945212
figures/cases/document.png ADDED

Git LFS Details

  • SHA256: fcf51f8771e4892035bb83a4b12f29dad87b916af8e5c050b68b3c662fbf3049
  • Pointer size: 131 Bytes
  • Size of remote file: 293 kB
figures/cases/document_parse.png ADDED

Git LFS Details

  • SHA256: 22eb791c58d8cd6e2bf354a2a4a31a758053990598f055e1d63a977596985080
  • Pointer size: 130 Bytes
  • Size of remote file: 68.9 kB
figures/cases/grounding1.png ADDED

Git LFS Details

  • SHA256: 385e449be9a5eb7d1bda277eb22a93b44ef51f9ab6ad9ff0adfa29754bb8eede
  • Pointer size: 131 Bytes
  • Size of remote file: 143 kB
figures/cases/grounding1_vis.png ADDED

Git LFS Details

  • SHA256: bd3551d542acf252a1d1e0be8273776c74d3034e067b6fcde80fe5a448e52cbd
  • Pointer size: 131 Bytes
  • Size of remote file: 144 kB
figures/cases/grounding2.png ADDED

Git LFS Details

  • SHA256: 44cadc4f181122cbbbe0708112371f6ab03e97493342978b03c4048657242ad7
  • Pointer size: 130 Bytes
  • Size of remote file: 18.1 kB
figures/cases/grounding2_vis.png ADDED

Git LFS Details

  • SHA256: cf3f44b74ff432f04838d7e39caa1da7f32479df3160ee657b9899654f3e4d85
  • Pointer size: 130 Bytes
  • Size of remote file: 19.9 kB
figures/cases/gui.png ADDED

Git LFS Details

  • SHA256: 3a476aef2a2f99b9d97f81b534a695f2456737719da43688b3a0ee95f7269181
  • Pointer size: 132 Bytes
  • Size of remote file: 1.12 MB
figures/cases/knowledge1.png ADDED

Git LFS Details

  • SHA256: 2d0771f2bffbc3421c651305fb6fa331af139f8d017ca0ec2ee5240a47d74b0b
  • Pointer size: 130 Bytes
  • Size of remote file: 57.8 kB
figures/cases/knowledge2.png ADDED

Git LFS Details

  • SHA256: 6704773a94f90480047a21020571632cd0225550e7787bc9c544ff08af19f444
  • Pointer size: 129 Bytes
  • Size of remote file: 8.05 kB
figures/cases/ocr.png ADDED

Git LFS Details

  • SHA256: 2bd9ecff1d709d4fb02b807032d3d1999a781faa2e9ca74902a85e0a0230cff8
  • Pointer size: 131 Bytes
  • Size of remote file: 658 kB
figures/cases/plant.png ADDED

Git LFS Details

  • SHA256: e6bfe9b71741d79cb7c76a56b740542b27295796070333939771c9bcdca68147
  • Pointer size: 130 Bytes
  • Size of remote file: 15.9 kB
figures/cases/reasoning.png ADDED

Git LFS Details

  • SHA256: c0f1d2fdd6cf36e5921df03be8f1c0479ff3e6c4c4b30b6ad434c726b10129b2
  • Pointer size: 129 Bytes
  • Size of remote file: 5.46 kB
figures/cases/s2s.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5fd492db166436757f6774e0341245f05a24e2c81ab792112816a19b5e239589
3
+ size 963109
figures/ming.png ADDED

Git LFS Details

  • SHA256: bcda355a5d4af40f4305f735ea1a23e6caced67857b7a1db39419de1625bfb02
  • Pointer size: 131 Bytes
  • Size of remote file: 462 kB
figures/performance.png ADDED

Git LFS Details

  • SHA256: 6ea7df2370500a45bfaa67eb71283be4962617d2441e2c73bac2dac2f86b1aaa
  • Pointer size: 132 Bytes
  • Size of remote file: 1.24 MB
figures/unified_samples.png ADDED

Git LFS Details

  • SHA256: 8a7900334befc75198d677100cfb8a06029d24b228d09cfac4e0f8492c0f7779
  • Pointer size: 132 Bytes
  • Size of remote file: 9.8 MB
model-00001-of-00016.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9ffca90426a130f5e8224dd036112dc3c77ed70bd18a68a74ac5bfab6205552f
3
  size 4991703896
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad46ac358a4008f04a1a30cc7aaa380ea9bb768c8a95fb056a3d0adbbdaa1fc0
3
  size 4991703896
model-00002-of-00016.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ca5c2c024f8be519d2d5ce5befdbf46a0a0ea65b6bfc09fb3a8a1185b4939e49
3
  size 5000002488
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9af70387993676a39ded151d93765ea1207b21ea370df24a9e37b5bfdb115cc1
3
  size 5000002488
model-00015-of-00016.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:72bf5838538c529304129e1c37eaf405955c12bfc219d3890388e0322d08ea23
3
  size 4987365648
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2502c41990fddac43223d09f378918252fa4e7fce71120e9effbd9eb7e38e84a
3
  size 4987365648
model-00016-of-00016.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8d6a32acb61e65e81692ceb36be95ea583c7b86c2051c4e78c47e391e690e381
3
  size 1360124632
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b55e94be9b6e05036a5405bf15e7dcdb05ba498272d7d8fe9074dfb40e22fd84
3
  size 1360124632
modeling_bailing_talker.py CHANGED
@@ -141,9 +141,10 @@ class BailingTalkerForConditionalGeneration(PreTrainedModel):
141
  logits=logits,
142
  )
143
 
144
- def sample(self, logits, topk=20, filter_value=-float("Inf")):
145
  logits = logits.reshape(1, -1) # [1, V]
146
  indices_to_remove = logits < torch.topk(logits, topk)[0][..., -1, None]
 
147
  logits[indices_to_remove] = filter_value
148
  token_id = torch.multinomial(torch.softmax(logits, dim=-1), num_samples=1).to(torch.long)
149
  return token_id
@@ -161,23 +162,33 @@ class BailingTalkerForConditionalGeneration(PreTrainedModel):
161
  thinker_length,
162
  vp_emb=None,
163
  thinker_reply_part=None,
 
 
164
  ):
165
 
166
  text_input_part = self.tokenizer.encode(tts_text)
 
 
 
 
 
 
167
  # audio_prefix and text_prefix for first step generation
168
  talker_text_prefix = (
169
- prompt +
170
- prefix_from_thinker +
171
- vp +
172
- text_input_part[:1]
173
  )
174
  # the rest of input_text
175
  talker_text_input_part = (
176
- text_input_part[1:] +
 
177
  self.tokenizer.encode("<text_eos>") +
178
  self.tokenizer.encode("<text_pad>")
179
  )
180
 
 
181
  talker_text_prefix = torch.tensor(talker_text_prefix).reshape(1, -1).to(self.device)
182
 
183
 
@@ -192,6 +203,7 @@ class BailingTalkerForConditionalGeneration(PreTrainedModel):
192
  thinker_reply_part=thinker_reply_part,
193
  thinker_reply_length=torch.tensor([thinker_length]).to(self.device),
194
  thinker_prefix_insert_loc=torch.tensor([len(prompt) + 1]).to(self.device) if thinker_reply_part is not None else None,
 
195
  )
196
 
197
  audio_token = [ele - len(self.tokenizer) for ele in audio_token]
@@ -257,6 +269,9 @@ class BailingTalkerForConditionalGeneration(PreTrainedModel):
257
  vp_emb=None,
258
  thinker_reply_part=None,
259
  max_length=50,
 
 
 
260
  ):
261
 
262
  # thinker_reply_part: [B, T, d]
@@ -302,6 +317,8 @@ class BailingTalkerForConditionalGeneration(PreTrainedModel):
302
  thinker_length=thinker_length,
303
  vp_emb=vp_emb,
304
  thinker_reply_part=thinker_reply_part,
 
 
305
  )
306
  audio_tokens.append(audio_tokens_piece)
307
  return audio_tokens
@@ -319,10 +336,13 @@ class BailingTalkerForConditionalGeneration(PreTrainedModel):
319
  thinker_reply_part: Optional[torch.FloatTensor] = None,
320
  thinker_reply_length: Optional[torch.FloatTensor] = None,
321
  thinker_prefix_insert_loc: Optional[torch.LongTensor] = None,
 
 
322
  ):
323
  result = []
324
  step = 0
325
  eos_id = self.tokenizer.encode("<audio_eos>")[0]
 
326
  while step < 1000:
327
  if step == 0:
328
  talker_audio_input_ids = talker_audio_prefix
@@ -356,10 +376,16 @@ class BailingTalkerForConditionalGeneration(PreTrainedModel):
356
  # sampling
357
  logits = outputs.logits[:, -1, :]
358
 
359
- next_token = self.sample(logits)
 
360
  if next_token.item() == eos_id:
361
  break
362
- result.append(next_token.item())
 
 
 
 
 
363
  step += 1
364
 
365
  return result
@@ -374,13 +400,14 @@ class AudioDetokenizer:
374
  self.model.load(flow_model_path, hifigan_model_path)
375
  self.sr = 22050
376
 
377
- def token2wav(self, audio_tokens, flow_embedding, save_path=None):
378
  assert isinstance(audio_tokens, list), f"audio_tokens should be list"
379
  speech_list = []
380
  for audio_token in audio_tokens:
381
- model_input = {"tts_speech_token": audio_token,
382
- 'flow_embedding': flow_embedding}
383
- model_output = self.model.inference(**model_input)
 
384
 
385
  silent_dur = 0.02
386
  silent_tensor = torch.Tensor([0.0] * int(self.sr * silent_dur))
 
141
  logits=logits,
142
  )
143
 
144
+ def sample(self, logits, topk=20, filter_value=-float("Inf"), stopping_criteria=False, eos_id=151666):
145
  logits = logits.reshape(1, -1) # [1, V]
146
  indices_to_remove = logits < torch.topk(logits, topk)[0][..., -1, None]
147
+ indices_to_remove[0][eos_id] = True if stopping_criteria is True else indices_to_remove[0][eos_id]
148
  logits[indices_to_remove] = filter_value
149
  token_id = torch.multinomial(torch.softmax(logits, dim=-1), num_samples=1).to(torch.long)
150
  return token_id
 
162
  thinker_length,
163
  vp_emb=None,
164
  thinker_reply_part=None,
165
+ prompt_text=None,
166
+ prompt_speech_token=None,
167
  ):
168
 
169
  text_input_part = self.tokenizer.encode(tts_text)
170
+
171
+ prompt_text_input_part = self.tokenizer.encode(prompt_text)
172
+ prompt_speech_token = prompt_speech_token[0].tolist()
173
+ prompt_speech_token_bpe = self.s3bpe_tokenizer.encode(prompt_speech_token)[0]
174
+ prompt_speech_token_bpe = (torch.tensor(prompt_speech_token_bpe) + len(self.tokenizer) ).tolist()
175
+
176
  # audio_prefix and text_prefix for first step generation
177
  talker_text_prefix = (
178
+ prompt +
179
+ prefix_from_thinker +
180
+ vp +
181
+ prompt_text_input_part[:1]
182
  )
183
  # the rest of input_text
184
  talker_text_input_part = (
185
+ prompt_text_input_part[1:] +
186
+ text_input_part +
187
  self.tokenizer.encode("<text_eos>") +
188
  self.tokenizer.encode("<text_pad>")
189
  )
190
 
191
+
192
  talker_text_prefix = torch.tensor(talker_text_prefix).reshape(1, -1).to(self.device)
193
 
194
 
 
203
  thinker_reply_part=thinker_reply_part,
204
  thinker_reply_length=torch.tensor([thinker_length]).to(self.device),
205
  thinker_prefix_insert_loc=torch.tensor([len(prompt) + 1]).to(self.device) if thinker_reply_part is not None else None,
206
+ prompt_wav_token=prompt_speech_token_bpe,
207
  )
208
 
209
  audio_token = [ele - len(self.tokenizer) for ele in audio_token]
 
269
  vp_emb=None,
270
  thinker_reply_part=None,
271
  max_length=50,
272
+ prompt_text=None,
273
+ prompt_speech_token=None,
274
+ **kwargs,
275
  ):
276
 
277
  # thinker_reply_part: [B, T, d]
 
317
  thinker_length=thinker_length,
318
  vp_emb=vp_emb,
319
  thinker_reply_part=thinker_reply_part,
320
+ prompt_text=prompt_text,
321
+ prompt_speech_token=prompt_speech_token,
322
  )
323
  audio_tokens.append(audio_tokens_piece)
324
  return audio_tokens
 
336
  thinker_reply_part: Optional[torch.FloatTensor] = None,
337
  thinker_reply_length: Optional[torch.FloatTensor] = None,
338
  thinker_prefix_insert_loc: Optional[torch.LongTensor] = None,
339
+ prompt_wav_token: List = [],
340
+ min_new_token = 10,
341
  ):
342
  result = []
343
  step = 0
344
  eos_id = self.tokenizer.encode("<audio_eos>")[0]
345
+ prompt_wav_token_len = len(prompt_wav_token)
346
  while step < 1000:
347
  if step == 0:
348
  talker_audio_input_ids = talker_audio_prefix
 
376
  # sampling
377
  logits = outputs.logits[:, -1, :]
378
 
379
+ stopping_criteria = position_ids.item() < prompt_wav_token_len + min_new_token
380
+ next_token = self.sample(logits, stopping_criteria=stopping_criteria )
381
  if next_token.item() == eos_id:
382
  break
383
+
384
+ if len(prompt_wav_token) > 0:
385
+ next_token = torch.tensor([[prompt_wav_token[0]]]).to(logits.device)
386
+ prompt_wav_token = prompt_wav_token[1:]
387
+ else:
388
+ result.append(next_token.item())
389
  step += 1
390
 
391
  return result
 
400
  self.model.load(flow_model_path, hifigan_model_path)
401
  self.sr = 22050
402
 
403
+ def token2wav(self, audio_tokens, save_path=None, **kwargs):
404
  assert isinstance(audio_tokens, list), f"audio_tokens should be list"
405
  speech_list = []
406
  for audio_token in audio_tokens:
407
+ model_input = {"tts_speech_token": audio_token}
408
+ kwargs.update(**model_input)
409
+
410
+ model_output = self.model.inference(**kwargs)
411
 
412
  silent_dur = 0.02
413
  silent_tensor = torch.Tensor([0.0] * int(self.sr * silent_dur))
modeling_bailingmm.py CHANGED
@@ -5,6 +5,7 @@
5
  import copy
6
  from dataclasses import dataclass
7
  from typing import Any, Dict, List, Optional, Tuple, Union
 
8
 
9
  import numpy as np
10
  import torch
@@ -495,9 +496,18 @@ class BailingMMNativeForConditionalGeneration(PreTrainedModel):
495
  from diffusion.sana_loss import SANALoss
496
  import os
497
  from safetensors.torch import load_file
498
-
499
- temp_state_dict = load_file(os.path.join(inference_model_path, 'mlp', 'model.safetensors'))
500
-
 
 
 
 
 
 
 
 
 
501
  self.query_tokens_dict = nn.ParameterDict()
502
  self.img_gen_scales = [4, 8, 16]
503
  for scale in self.img_gen_scales:
@@ -535,7 +545,7 @@ class BailingMMNativeForConditionalGeneration(PreTrainedModel):
535
  self.diffusion_loss.to(self.model.device)
536
  #self.norm_query_embeds = True
537
  # load connector
538
- self.connector = AutoModelForCausalLM.from_pretrained(os.path.join(inference_model_path, 'connector'))
539
  for layer in self.connector.model.layers:
540
  layer.self_attn.is_causal = False
541
  self.connector.to(self.model.device)
@@ -555,4 +565,19 @@ class BailingMMNativeForConditionalGeneration(PreTrainedModel):
555
  self.proj_out.load_state_dict(modified_state_dict_out, strict=True)
556
  self.proj_in.to(self.model.device)
557
  self.proj_out.to(self.model.device)
558
- self.loaded_image_gen_modules = True
 
5
  import copy
6
  from dataclasses import dataclass
7
  from typing import Any, Dict, List, Optional, Tuple, Union
8
+ import os
9
 
10
  import numpy as np
11
  import torch
 
496
  from diffusion.sana_loss import SANALoss
497
  import os
498
  from safetensors.torch import load_file
499
+ if os.path.exists(inference_model_path):
500
+ temp_state_dict = load_file(os.path.join(inference_model_path, 'mlp', 'model.safetensors'))
501
+ else:
502
+ from huggingface_hub import hf_hub_download
503
+ from safetensors import safe_open
504
+ safetensors_path = hf_hub_download(
505
+ repo_id=inference_model_path,
506
+ filename="model.safetensors",
507
+ subfolder="mlp"
508
+ )
509
+ with safe_open(safetensors_path, framework="pt") as f:
510
+ temp_state_dict = {key: f.get_tensor(key) for key in f.keys()}
511
  self.query_tokens_dict = nn.ParameterDict()
512
  self.img_gen_scales = [4, 8, 16]
513
  for scale in self.img_gen_scales:
 
545
  self.diffusion_loss.to(self.model.device)
546
  #self.norm_query_embeds = True
547
  # load connector
548
+ self.connector = AutoModelForCausalLM.from_pretrained(inference_model_path, subfolder='connector')
549
  for layer in self.connector.model.layers:
550
  layer.self_attn.is_causal = False
551
  self.connector.to(self.model.device)
 
565
  self.proj_out.load_state_dict(modified_state_dict_out, strict=True)
566
  self.proj_in.to(self.model.device)
567
  self.proj_out.to(self.model.device)
568
+ self.loaded_image_gen_modules = True
569
+
570
+ @classmethod
571
+ def from_pretrained(
572
+ cls,
573
+ pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
574
+ *model_args,
575
+ **kwargs,
576
+ ):
577
+ model = super().from_pretrained(
578
+ pretrained_model_name_or_path,
579
+ *model_args,
580
+ **kwargs,
581
+ )
582
+ model.load_image_gen_modules(pretrained_model_name_or_path)
583
+ return model
requirements.txt ADDED
@@ -0,0 +1,20 @@
 
1
+ torch==2.1.2
2
+ torchvision==0.16.2
3
+ torchaudio==2.1.2
4
+ funasr==1.1.14
5
+ flash-attn==2.3.6
6
+ peft==0.13.2
7
+ diffusers #==0.33
8
+ tokenizers==0.20.3
9
+ transformers==4.45.0
10
+ grouped_gemm==0.1.4
11
+ decord==0.6.0
12
+ hyperpyyaml
13
+ modelscope
14
+ onnxruntime
15
+ inflect
16
+ conformer
17
+ diffusers
18
+ lightning
19
+ gdown
20
+ openai-whisper==20240930
test_audio_tasks.py CHANGED
@@ -94,9 +94,9 @@ class BailingMMInfer:
94
 
95
  if self.model.talker is not None and output_audio:
96
  thinker_reply_part = outputs.hidden_states[0][0] + outputs.hidden_states[0][-1]
97
- spk_embed = self.spk_info.get(speaker, 'luna')
98
- audio_tokens = self.model.talker.omni_audio_generation(output_text, vp_emb=spk_embed, thinker_reply_part=thinker_reply_part)
99
- waveform = self.audio_detokenizer.token2wav(audio_tokens, spk_embed, save_path=output_audio_path)
100
  return output_text, waveform
101
  return output_text
102
 
 
94
 
95
  if self.model.talker is not None and output_audio:
96
  thinker_reply_part = outputs.hidden_states[0][0] + outputs.hidden_states[0][-1]
97
+ spk_input = self.spk_info.get(speaker, 'luna')
98
+ audio_tokens = self.model.talker.omni_audio_generation(output_text, thinker_reply_part=thinker_reply_part, **spk_input)
99
+ waveform = self.audio_detokenizer.token2wav(audio_tokens, save_path=output_audio_path, **spk_input)
100
  return output_text, waveform
101
  return output_text
102