update readme
Browse files
README.md
CHANGED
|
@@ -1101,6 +1101,7 @@ else:
|
|
| 1101 |
|
| 1102 |
### Audio-Only mode
|
| 1103 |
#### Mimick
|
|
|
|
| 1104 |
```python
|
| 1105 |
mimick_prompt = "Please repeat each user's speech, including voice style and speech content."
|
| 1106 |
audio_input, _ = librosa.load('xxx.wav', sr=16000, mono=True)
|
|
@@ -1124,16 +1125,18 @@ res = model.chat(
|
|
| 1124 |
```python
|
| 1125 |
ref_audio, _ = librosa.load('assets/demo.wav', sr=16000, mono=True) # load the reference audio
|
| 1126 |
|
| 1127 |
-
#
|
| 1128 |
-
|
| 1129 |
-
|
|
|
|
| 1130 |
|
| 1131 |
-
|
| 1132 |
-
|
| 1133 |
-
|
| 1134 |
```
|
| 1135 |
```python
|
| 1136 |
msgs = [sys_prompt, user_question]
|
|
|
|
| 1137 |
res = model.chat(
|
| 1138 |
msgs=msgs,
|
| 1139 |
tokenizer=tokenizer,
|
|
@@ -1179,7 +1182,7 @@ General Audio:
|
|
| 1179 |
Audio Caption: Summarize the main content of the audio.
|
| 1180 |
Sound Scene Tagging: Utilize one keyword to convey the audio's content or the associated scene.
|
| 1181 |
'''
|
| 1182 |
-
task_prompt = "
|
| 1183 |
audio_input, _ = librosa.load('xxx.wav', sr=16000, mono=True)
|
| 1184 |
|
| 1185 |
msgs = [{'role': 'user', 'content': [task_prompt,audio_input]}]
|
|
@@ -1204,19 +1207,19 @@ Speech Generation Task Prompt:
|
|
| 1204 |
# 在新闻中,一个年轻男性兴致勃勃地说:“祝福亲爱的祖国母亲美丽富强!”他用低音调和低音量,慢慢地说出了这句话。
|
| 1205 |
# Delighting in a surprised tone, an adult male with low pitch and low volume comments:"One even gave my little dog a biscuit" This dialogue takes place at a leisurely pace, delivering a sense of excitement and surprise in the context.
|
| 1206 |
|
| 1207 |
-
Voice Cloning or Voice
|
| 1208 |
'''
|
| 1209 |
# Human Instruction-to-Speech:
|
| 1210 |
-
task_prompt = '' #Try to make some Human Instruction-to-Speech prompt
|
| 1211 |
-
msgs = [{'role': 'user', 'content': [task_prompt]}] # you can try to
|
| 1212 |
|
| 1213 |
-
# Voice Cloning mode:
|
| 1214 |
# sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='voice_cloning', language='en')
|
| 1215 |
# text_prompt = f"Please read the text below."
|
| 1216 |
# user_question = {'role': 'user', 'content': [text_prompt, "content that you want to read"]} # using same voice in sys_prompt to read the text. (Voice Cloning)
|
| 1217 |
# user_question = {'role': 'user', 'content': [text_prompt, librosa.load('xxx.wav', sr=16000, mono=True)[0]]} # using same voice in sys_prompt to read 'xxx.wav'. (Voice Creation)
|
|
|
|
| 1218 |
|
| 1219 |
-
msgs = [sys_prompt, user_question]
|
| 1220 |
res = model.chat(
|
| 1221 |
msgs=msgs,
|
| 1222 |
tokenizer=tokenizer,
|
|
|
|
| 1101 |
|
| 1102 |
### Audio-Only mode
|
| 1103 |
#### Mimick
|
| 1104 |
+
- In this task, you can see the model's end-to-end ability. MiniCPM-o 2.6 takes an audio input and produces both an automatic speech recognition (ASR) transcription and a voice imitation (TTS) output.
|
| 1105 |
```python
|
| 1106 |
mimick_prompt = "Please repeat each user's speech, including voice style and speech content."
|
| 1107 |
audio_input, _ = librosa.load('xxx.wav', sr=16000, mono=True)
|
|
|
|
| 1125 |
```python
|
| 1126 |
ref_audio, _ = librosa.load('assets/demo.wav', sr=16000, mono=True) # load the reference audio
|
| 1127 |
|
| 1128 |
+
# Choose the mode you want to use
|
| 1129 |
+
# Audio RolePlay: # With this mode, the model will role-play the character based on the audio prompt. (More human-like conversation but unstable)
|
| 1130 |
+
# sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='audio_roleplay', language='en')
|
| 1131 |
+
# user_question = {'role': 'user', 'content': [librosa.load('xxx.wav', sr=16000, mono=True)[0]]}
|
| 1132 |
|
| 1133 |
+
Audio Assistant: # With this mode, the model will speak with the voice in ref_audio as an AI assistant. (Stable and more suitable for general conversation)
|
| 1134 |
+
sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='audio_assistant', language='en')
|
| 1135 |
+
user_question = {'role': 'user', 'content': [librosa.load('xxx.wav', sr=16000, mono=True)[0]]} # Try to ask something by recording it in 'xxx.wav'!!!
|
| 1136 |
```
|
| 1137 |
```python
|
| 1138 |
msgs = [sys_prompt, user_question]
|
| 1139 |
+
# round one
|
| 1140 |
res = model.chat(
|
| 1141 |
msgs=msgs,
|
| 1142 |
tokenizer=tokenizer,
|
|
|
|
| 1182 |
Audio Caption: Summarize the main content of the audio.
|
| 1183 |
Sound Scene Tagging: Utilize one keyword to convey the audio's content or the associated scene.
|
| 1184 |
'''
|
| 1185 |
+
task_prompt = "" # Choose the task prompt above
|
| 1186 |
audio_input, _ = librosa.load('xxx.wav', sr=16000, mono=True)
|
| 1187 |
|
| 1188 |
msgs = [{'role': 'user', 'content': [task_prompt,audio_input]}]
|
|
|
|
| 1207 |
# 在新闻中,一个年轻男性兴致勃勃地说:“祝福亲爱的祖国母亲美丽富强!”他用低音调和低音量,慢慢地说出了这句话。
|
| 1208 |
# Delighting in a surprised tone, an adult male with low pitch and low volume comments:"One even gave my little dog a biscuit" This dialogue takes place at a leisurely pace, delivering a sense of excitement and surprise in the context.
|
| 1209 |
|
| 1210 |
+
Voice Cloning or Voice Conversion: With this mode, the model will act like a TTS model.
|
| 1211 |
'''
|
| 1212 |
# Human Instruction-to-Speech:
|
| 1213 |
+
task_prompt = '' #Try to make some Human Instruction-to-Speech prompt (Voice Creation)
|
| 1214 |
+
msgs = [{'role': 'user', 'content': [task_prompt]}] # you can also try to ask the same audio question
|
| 1215 |
|
| 1216 |
+
# Voice Cloning mode:
|
| 1217 |
# sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='voice_cloning', language='en')
|
| 1218 |
# text_prompt = f"Please read the text below."
|
| 1219 |
# user_question = {'role': 'user', 'content': [text_prompt, "content that you want to read"]} # using same voice in sys_prompt to read the text. (Voice Cloning)
|
| 1220 |
# user_question = {'role': 'user', 'content': [text_prompt, librosa.load('xxx.wav', sr=16000, mono=True)[0]]} # using same voice in sys_prompt to read 'xxx.wav'. (Voice Creation)
|
| 1221 |
+
# msgs = [sys_prompt, user_question]
|
| 1222 |
|
|
|
|
| 1223 |
res = model.chat(
|
| 1224 |
msgs=msgs,
|
| 1225 |
tokenizer=tokenizer,
|