lihongjie
committed on
Commit
·
f66b0a3
1
Parent(s):
9a83bfa
update
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +55 -0
- .gitignore +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/llm.llm_embedding.float16.bin +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/llm.llm_embedding.float32.bin +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/llm.llm_embedding.npy +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/llm.speech_embedding.float16.bin +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/llm.speech_embedding.float32.bin +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/llm.speech_embedding.npy +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/llm_decoder.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/model.embed_tokens.weight.bfloat16.bin +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/model.embed_tokens.weight.float32.bin +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/model.embed_tokens.weight.npy +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l0_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l10_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l11_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l12_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l13_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l14_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l15_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l16_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l17_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l18_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l19_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l1_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l20_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l21_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l22_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l23_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l2_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l3_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l4_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l5_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l6_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l7_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l8_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l9_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_post.axmodel +3 -0
- README.md +139 -3
- asset/cross_lingual_prompt.wav +3 -0
- asset/dingding.png +3 -0
- asset/output.wav +3 -0
- asset/zero_shot_prompt.wav +3 -0
- frontend-onnx/campplus.onnx +3 -0
- frontend-onnx/speech_tokenizer_v2.onnx +3 -0
- main_ax650 +3 -0
- run.sh +20 -0
- scripts/CosyVoice-BlankEN/tokenizer_config.json +40 -0
- scripts/CosyVoice-BlankEN/vocab.json +0 -0
- scripts/audio.py +83 -0
- scripts/cosyvoice2_tokenizer.py +124 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,58 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l16_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l18_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l8_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
CosyVoice-BlankEN-Ax650-prefill_512/llm.speech_embedding.float16.bin filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l4_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_post.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
CosyVoice-BlankEN-Ax650-prefill_512/model.embed_tokens.weight.npy filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l19_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l21_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l6_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l7_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l9_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
CosyVoice-BlankEN-Ax650-prefill_512/llm.llm_embedding.float32.bin filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
CosyVoice-BlankEN-Ax650-prefill_512/llm.speech_embedding.npy filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l0_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l20_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
CosyVoice-BlankEN-Ax650-prefill_512/llm.llm_embedding.float16.bin filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l10_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l13_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l14_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l3_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
CosyVoice-BlankEN-Ax650-prefill_512/model.embed_tokens.weight.float32.bin filter=lfs diff=lfs merge=lfs -text
|
| 58 |
+
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l1_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 59 |
+
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l2_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 60 |
+
CosyVoice-BlankEN-Ax650-prefill_512/llm.llm_embedding.npy filter=lfs diff=lfs merge=lfs -text
|
| 61 |
+
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l15_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 62 |
+
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l23_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 63 |
+
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l5_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 64 |
+
CosyVoice-BlankEN-Ax650-prefill_512/llm_decoder.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 65 |
+
CosyVoice-BlankEN-Ax650-prefill_512/llm.speech_embedding.float32.bin filter=lfs diff=lfs merge=lfs -text
|
| 66 |
+
CosyVoice-BlankEN-Ax650-prefill_512/model.embed_tokens.weight.bfloat16.bin filter=lfs diff=lfs merge=lfs -text
|
| 67 |
+
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l11_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 68 |
+
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l12_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 69 |
+
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l17_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 70 |
+
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l22_together.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 71 |
+
asset/output.wav filter=lfs diff=lfs merge=lfs -text
|
| 72 |
+
asset/zero_shot_prompt.wav filter=lfs diff=lfs merge=lfs -text
|
| 73 |
+
asset/cross_lingual_prompt.wav filter=lfs diff=lfs merge=lfs -text
|
| 74 |
+
asset/dingding.png filter=lfs diff=lfs merge=lfs -text
|
| 75 |
+
token2wav-axmodels/flow_encoder_28.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 76 |
+
token2wav-axmodels/flow_estimator_250.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 77 |
+
token2wav-axmodels/flow.input_embedding.float32.bin filter=lfs diff=lfs merge=lfs -text
|
| 78 |
+
token2wav-axmodels/hift_58.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 79 |
+
token2wav-axmodels/hift_50_first.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 80 |
+
frontend-onnx/speech_tokenizer_v2.onnx filter=lfs diff=lfs merge=lfs -text
|
| 81 |
+
token2wav-axmodels/flow_encoder_50_final.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 82 |
+
token2wav-axmodels/flow_encoder_53.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 83 |
+
token2wav-axmodels/flow_encoder_78.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 84 |
+
token2wav-axmodels/flow_estimator_300.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 85 |
+
token2wav-axmodels/flow.input_embedding.npy filter=lfs diff=lfs merge=lfs -text
|
| 86 |
+
frontend-onnx/campplus.onnx filter=lfs diff=lfs merge=lfs -text
|
| 87 |
+
main_ax650 filter=lfs diff=lfs merge=lfs -text
|
| 88 |
+
token2wav-axmodels/flow_estimator_200.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 89 |
+
token2wav-axmodels/flow.input_embedding.float16.bin filter=lfs diff=lfs merge=lfs -text
|
| 90 |
+
scripts/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
output*.wav
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.txt
|
CosyVoice-BlankEN-Ax650-prefill_512/llm.llm_embedding.float16.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:978e40e848cab25f75617ff943df4e0f2206456f9d801503864c5077441785f9
|
| 3 |
+
size 3584
|
CosyVoice-BlankEN-Ax650-prefill_512/llm.llm_embedding.float32.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:edd849b97257d603c196016d0c5c8789f2883f6f3f3d3027894d1b640b82e336
|
| 3 |
+
size 7168
|
CosyVoice-BlankEN-Ax650-prefill_512/llm.llm_embedding.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a7b3d907bbdb6c19282fbd7cfc6b983dad9e11bfcd47065f1c468ee5748098f3
|
| 3 |
+
size 7296
|
CosyVoice-BlankEN-Ax650-prefill_512/llm.speech_embedding.float16.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b1c06ba6c2d4960d217cc0603238007c326ee709b8c6454cb2ce7793593c86fe
|
| 3 |
+
size 11762688
|
CosyVoice-BlankEN-Ax650-prefill_512/llm.speech_embedding.float32.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0b0eafc818e5f45139ac002ccf8894ce13ae54022da258bf81348defb639f6a0
|
| 3 |
+
size 23525376
|
CosyVoice-BlankEN-Ax650-prefill_512/llm.speech_embedding.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dd1b15ba46f7a521014804cf6d370e53a2797eeb42c2e29ce37ce04938eaffb0
|
| 3 |
+
size 23525504
|
CosyVoice-BlankEN-Ax650-prefill_512/llm_decoder.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8b90819939556ea19bba1900a4be4a4697e7d3b154912f459ac8d199dda566fd
|
| 3 |
+
size 6321489
|
CosyVoice-BlankEN-Ax650-prefill_512/model.embed_tokens.weight.bfloat16.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2289f29d6156cc9b55020b1cdc6207b0477baa19d1f822d0ff8247ef4015c1b7
|
| 3 |
+
size 272269312
|
CosyVoice-BlankEN-Ax650-prefill_512/model.embed_tokens.weight.float32.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:298a6f5cb4039f7b1dcf7bca4ba132f710afc949a6dd1bba8b2cf23fa45ebbf3
|
| 3 |
+
size 544538624
|
CosyVoice-BlankEN-Ax650-prefill_512/model.embed_tokens.weight.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f802eeb1dbbca75062c93feba83b1e804d95d867c86fbda6586855715adc2f3b
|
| 3 |
+
size 544538752
|
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l0_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0aebad8d31a03657e40a082f2765dae803abe931270d565610f69b2c475b83bc
|
| 3 |
+
size 17994026
|
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l10_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:805216134d5b4e19d00073aa6211ce911502aabfb8be4087e244afca62f17edf
|
| 3 |
+
size 17994026
|
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l11_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5b09d2e98ce163f0df076413345373222c3f4853cd8e7242aed6989858a20907
|
| 3 |
+
size 17994026
|
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l12_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b673053276ef44f1c0703f109e73d263c9652e40aa629ba7a1396b4b545f3e5c
|
| 3 |
+
size 17994026
|
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l13_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fd857094ea1614fe1e5033e24f428df4cb16c78ecdfbc2b2370a762aa80b3169
|
| 3 |
+
size 17994026
|
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l14_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:182d7e201b1436dec547d87c1334d2182966f60b786e7a7246a0732757528bf1
|
| 3 |
+
size 17994026
|
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l15_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4f409277a24082ddca92b59efe5fe62ae8b5ee5fa2248fb8b467275b71369d0c
|
| 3 |
+
size 17994026
|
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l16_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1bd75fbfb5b955b92f2c8f86dc881a5711a178823d7234df6d303cf5c4ba563c
|
| 3 |
+
size 17994026
|
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l17_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:707031640b471510e54e8f44cd732110dee3d95cca6013e2c39f8b27cb59553d
|
| 3 |
+
size 17994026
|
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l18_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7608f861bb468f103396cd7a7d69d8b53990b4d4fd116008b228974fae94746d
|
| 3 |
+
size 17994026
|
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l19_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2781fe5f613cfa72356aa76f08836c88b7011f0784412b349cd0b371e096bbab
|
| 3 |
+
size 17994026
|
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l1_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1c963b55a8483f370ed908fe2fe40ae040b299ffd9da474de00d3dea53bda04d
|
| 3 |
+
size 17994026
|
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l20_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a416f42098d31f72463504918b602b95c0ccffb09767474d2f78a0d6a6124bf7
|
| 3 |
+
size 17994026
|
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l21_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:321815f246ce90cf9b36fd5b82b193620150ca77341749df0fdf6987e93471b0
|
| 3 |
+
size 17994026
|
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l22_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6839b2c0838db525f8422ba3870dc4731edfddf514e24d5d472277b08a4a1b1f
|
| 3 |
+
size 17994026
|
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l23_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ff11b7670e19c26fb25759da8e1cfc913d0615edb297cd1928c2ecdd60500a8f
|
| 3 |
+
size 17994026
|
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l2_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:55ccb4f87cccede314e377b4f816a539d7862992e1ba4bd0b230a320ff661abc
|
| 3 |
+
size 17994026
|
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l3_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c402b7518050575e78f8b0162a4a3f911d54390b3d3b29f8df833c78e0149e06
|
| 3 |
+
size 17994026
|
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l4_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:74f5ec27b61c8008cba08a2b3e64d8bde5716852a5d1b4a35ce45476e6c83db8
|
| 3 |
+
size 17994026
|
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l5_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:48823dada608acb7ddc1c2735db1bcb30c5150d972ee454d52b25c4fefbe2911
|
| 3 |
+
size 17994026
|
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l6_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6ca1277efcc43ba1767970b773cb5f1f478d795991ea01471eee59f755d188c8
|
| 3 |
+
size 17994026
|
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l7_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2312ada84911e6a306846f95dd8889ede9d9e71d09cfb7cead34b87a287fea2e
|
| 3 |
+
size 17994026
|
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l8_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e7bdc2563f8f6efbefe121cb1b20d96166183453ce9ef79349257a3eba0aa28c
|
| 3 |
+
size 17994026
|
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l9_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b28414e4228a9a8459d18af962c5149292310076116b4012648ca72727c6de7f
|
| 3 |
+
size 17994026
|
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_post.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e60b8f3aba22079cbc0e71cad0ce91ad662493266f540eb46124f285a9ccc519
|
| 3 |
+
size 147957523
|
README.md
CHANGED
|
@@ -1,3 +1,139 @@
|
|
| 1 |
-
---
|
| 2 |
-
license: mit
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
language:
|
| 4 |
+
- en
|
| 5 |
+
- zh
|
| 6 |
+
base_model:
|
| 7 |
+
- CosyVoice2
|
| 8 |
+
pipeline_tag: text-to-speech
|
| 9 |
+
library_name: transformers
|
| 10 |
+
tags:
|
| 11 |
+
- CosyVoice2
|
| 12 |
+
- Speech
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
# CosyVoice2
|
| 16 |
+
This version of CosyVoice2 has been converted to run on the Axera NPU using **w8a16** quantization.
|
| 17 |
+
Compatible with Pulsar2 version: 4.2
|
| 18 |
+
|
| 19 |
+
## Conversion tool links
|
| 20 |
+
If you are interested in model conversion, you can export the axmodel files yourself starting from the original repository:
|
| 21 |
+
[Cosyvoice](https://github.com/FunAudioLLM/CosyVoice)
|
| 22 |
+
|
| 23 |
+
[Pulsar2 Link, How to Convert LLM from Huggingface to axmodel](https://pulsar2-docs.readthedocs.io/en/latest/appendix/build_llm.html)
|
| 24 |
+
|
| 25 |
+
[AXera NPU HOST LLM Runtime](https://github.com/AXERA-TECH/Cosyvoice2.Axera)
|
| 26 |
+
|
| 27 |
+
## Supported Platforms
|
| 28 |
+
|
| 29 |
+
- AX650
|
| 30 |
+
- AX650N DEMO Board
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
**Speech Generation**
|
| 34 |
+
| Stage | Time |
|
| 35 |
+
|------|------|
|
| 36 |
+
| LLM prefill (input_token_num + prompt_token_num in [0, 128]) | 104 ms |
|
| 37 |
+
| LLM prefill (input_token_num + prompt_token_num in [128, 256]) | 234 ms |
|
| 38 |
+
| Decode | 21.24 token/s |
|
| 39 |
+
|
| 40 |
+
## How to use
|
| 41 |
+
|
| 42 |
+
Download all files from this repository to the device
|
| 43 |
+
|
| 44 |
+
### 1. Text to Speech (Voice Cloning)
|
| 45 |
+
|
| 46 |
+
#### 1. Install Python libraries
|
| 47 |
+
Steps 2 and 3 require the use of these Python packages. If you run Steps 2 and 3 on a PC, install them on the PC.
|
| 48 |
+
```
|
| 49 |
+
pip3 install -r scripts/requirements.txt
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
#### 2. Process Prompt Speech
|
| 53 |
+
```
|
| 54 |
+
python scripts/process_prompt.py
|
| 55 |
+
```
|
| 56 |
+
|
| 57 |
+
Pass parameters according to the actual situation.
|
| 58 |
+
```
|
| 59 |
+
args.add_argument('--model_dir', type=str, default="../../model_convert/pretrained_models/CosyVoice2-0.5B/")
|
| 60 |
+
args.add_argument('--wetext_dir', type=str, default="../../model_convert/pengzhendong/wetext/")
|
| 61 |
+
args.add_argument('--sample_rate', type=int, default=24000)
|
| 62 |
+
args.add_argument('--zero_shot_spk_id', type=str, default="")
|
| 63 |
+
args.add_argument('--tts_text', type=str, default="君不见黄河之水天上来,奔流到海不复回。君不见高堂明镜悲白发,朝如青丝暮成雪。")
|
| 64 |
+
args.add_argument('--prompt_text', type=str, default="希望你以后能够做的比我还好呦。")
|
| 65 |
+
args.add_argument('--prompt_speech', type=str, default="../../model_convert/asset/zero_shot_prompt.wav")
|
| 66 |
+
```
|
| 67 |
+
|
| 68 |
+
#### 3. Start HTTP Tokenizer Server
|
| 69 |
+
```
|
| 70 |
+
cd scripts
|
| 71 |
+
python cosyvoice2_tokenizer.py --host {your host} --port {your port}
|
| 72 |
+
```
|
| 73 |
+
|
| 74 |
+
#### 4. Run on AX650 Board
|
| 75 |
+
1) Modify the HTTP host in `run.sh`.
|
| 76 |
+
2) Copy `scripts/run.sh`, `build/install/bin/main`, and the files generated by `process_prompt.py` to the AX650 board
|
| 77 |
+
3) Run `run.sh`
|
| 78 |
+
```shell
|
| 79 |
+
root@ax650 ~/yongqiang/lhj/Cosyvoice2.Axera/cpp/src # bash run.sh
|
| 80 |
+
rm: cannot remove 'output*.wav': No such file or directory
|
| 81 |
+
[I][ Init][ 108]: LLM init start
|
| 82 |
+
[I][ Init][ 34]: connect http://10.122.86.184:12345 ok
|
| 83 |
+
bos_id: 0, eos_id: 1773
|
| 84 |
+
7% | ███ | 2 / 27 [3.11s<42.04s, 0.64 count/s] embed_selector init ok[I][ Init][ 138]: attr.axmodel_num:24
|
| 85 |
+
100% | ████████████████████████████████ | 27 / 27 [10.32s<10.32s, 2.62 count/s] init post axmodel ok,remain_cmm(7178 MB)
|
| 86 |
+
[I][ Init][ 216]: max_token_len : 1023
|
| 87 |
+
[I][ Init][ 221]: kv_cache_size : 128, kv_cache_num: 1023
|
| 88 |
+
[I][ Init][ 229]: prefill_token_num : 128
|
| 89 |
+
[I][ Init][ 233]: grp: 1, prefill_max_token_num : 1
|
| 90 |
+
[I][ Init][ 233]: grp: 2, prefill_max_token_num : 128
|
| 91 |
+
[I][ Init][ 233]: grp: 3, prefill_max_token_num : 256
|
| 92 |
+
[I][ Init][ 233]: grp: 4, prefill_max_token_num : 384
|
| 93 |
+
[I][ Init][ 233]: grp: 5, prefill_max_token_num : 512
|
| 94 |
+
[I][ Init][ 237]: prefill_max_token_num : 512
|
| 95 |
+
[I][ Init][ 249]: LLM init ok
|
| 96 |
+
[I][ Init][ 154]: Token2Wav init ok
|
| 97 |
+
[I][ main][ 273]:
|
| 98 |
+
[I][ Run][ 388]: input token num : 142, prefill_split_num : 2
|
| 99 |
+
[I][ Run][ 422]: input_num_token:128
|
| 100 |
+
[I][ Run][ 422]: input_num_token:14
|
| 101 |
+
[I][ Run][ 607]: ttft: 236.90 ms
|
| 102 |
+
[Main/Token2Wav Thread] Processing batch of 28 tokens...
|
| 103 |
+
Successfully saved audio to output_0.wav (32-bit Float PCM).
|
| 104 |
+
[Main/Token2Wav Thread] Processing batch of 53 tokens...
|
| 105 |
+
Successfully saved audio to output_1.wav (32-bit Float PCM).
|
| 106 |
+
[Main/Token2Wav Thread] Processing batch of 78 tokens...
|
| 107 |
+
Successfully saved audio to output_2.wav (32-bit Float PCM).
|
| 108 |
+
[Main/Token2Wav Thread] Processing batch of 78 tokens...
|
| 109 |
+
Successfully saved audio to output_3.wav (32-bit Float PCM).
|
| 110 |
+
[Main/Token2Wav Thread] Processing batch of 78 tokens...
|
| 111 |
+
Successfully saved audio to output_4.wav (32-bit Float PCM).
|
| 112 |
+
[Main/Token2Wav Thread] Processing batch of 78 tokens...
|
| 113 |
+
Successfully saved audio to output_5.wav (32-bit Float PCM).
|
| 114 |
+
[Main/Token2Wav Thread] Processing batch of 78 tokens...
|
| 115 |
+
Successfully saved audio to output_6.wav (32-bit Float PCM).
|
| 116 |
+
[Main/Token2Wav Thread] Processing batch of 78 tokens...
|
| 117 |
+
Successfully saved audio to output_7.wav (32-bit Float PCM).
|
| 118 |
+
[Main/Token2Wav Thread] Processing batch of 78 tokens...
|
| 119 |
+
Successfully saved audio to output_8.wav (32-bit Float PCM).
|
| 120 |
+
[Main/Token2Wav Thread] Processing batch of 78 tokens...
|
| 121 |
+
Successfully saved audio to output_9.wav (32-bit Float PCM).
|
| 122 |
+
[I][ Run][ 723]: hit eos, llm finished
|
| 123 |
+
[I][ Run][ 753]: llm finished
|
| 124 |
+
[Main/Token2Wav Thread] Buffer is empty and LLM finished. Exiting.
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
[I][ Run][ 758]: total decode tokens:271
|
| 128 |
+
[N][ Run][ 759]: hit eos,avg 21.47 token/s
|
| 129 |
+
|
| 130 |
+
Successfully saved audio to output_10.wav (32-bit Float PCM).
|
| 131 |
+
Successfully saved audio to output.wav (32-bit Float PCM).
|
| 132 |
+
|
| 133 |
+
Voice generation pipeline completed.
|
| 134 |
+
Type "q" to exit, Ctrl+c to stop current running
|
| 135 |
+
text >>
|
| 136 |
+
```
|
| 137 |
+
|
| 138 |
+
Output Speech:
|
| 139 |
+
[output.wav](asset/output.wav)
|
asset/cross_lingual_prompt.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:353a7715c2e4811f4045658b29d1ce67ecad5120e09de10ce890f1763aab486c
|
| 3 |
+
size 606404
|
asset/dingding.png
ADDED
|
Git LFS Details
|
asset/output.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5c7a4c3837145df17e851c177f849446036e6f541d78eb6e107ea6b9e7b07672
|
| 3 |
+
size 1067564
|
asset/zero_shot_prompt.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bd199eb7109fd6ce9943cb297e3cf350c1073af014063dfadbdc100230526243
|
| 3 |
+
size 111496
|
frontend-onnx/campplus.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a6ac6a63997761ae2997373e2ee1c47040854b4b759ea41ec48e4e42df0f4d73
|
| 3 |
+
size 28303423
|
frontend-onnx/speech_tokenizer_v2.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d43342aa12163a80bf07bffb94c9de2e120a8df2f9917cd2f642e7f4219c6f71
|
| 3 |
+
size 496082973
|
main_ax650
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4338e14a404311a5533f2e27d17726a5142e9fe7b92016ea2e845dad9dfe8bd1
|
| 3 |
+
size 6641680
|
run.sh
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
LLM_DIR=CosyVoice-BlankEN-Ax650-prefill_512/
|
| 2 |
+
TOKEN2WAV_DIR=token2wav-axmodels/
|
| 3 |
+
|
| 4 |
+
rm output*.wav
|
| 5 |
+
./main_ax650 \
|
| 6 |
+
--template_filename_axmodel "${LLM_DIR}/qwen2_p128_l%d_together.axmodel" \
|
| 7 |
+
--token2wav_axmodel_dir $TOKEN2WAV_DIR \
|
| 8 |
+
--axmodel_num 24 \
|
| 9 |
+
--bos 0 --eos 0 \
|
| 10 |
+
--filename_tokenizer_model "http://10.122.86.184:12345" \
|
| 11 |
+
--filename_post_axmodel "${LLM_DIR}/qwen2_post.axmodel" \
|
| 12 |
+
--filename_decoder_axmodel "${LLM_DIR}/llm_decoder.axmodel" \
|
| 13 |
+
--filename_tokens_embed "${LLM_DIR}/model.embed_tokens.weight.bfloat16.bin" \
|
| 14 |
+
--filename_llm_embed "${LLM_DIR}/llm.llm_embedding.float16.bin" \
|
| 15 |
+
--filename_speech_embed "${LLM_DIR}/llm.speech_embedding.float16.bin" \
|
| 16 |
+
--continue 0 \
|
| 17 |
+
--text "君不见黄河之水天上来,奔流到海不复回。君不见高堂明镜悲白发,朝如青丝暮成雪。"
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
chmod 777 output*.wav
|
scripts/CosyVoice-BlankEN/tokenizer_config.json
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"added_tokens_decoder": {
|
| 4 |
+
"151643": {
|
| 5 |
+
"content": "<|endoftext|>",
|
| 6 |
+
"lstrip": false,
|
| 7 |
+
"normalized": false,
|
| 8 |
+
"rstrip": false,
|
| 9 |
+
"single_word": false,
|
| 10 |
+
"special": true
|
| 11 |
+
},
|
| 12 |
+
"151644": {
|
| 13 |
+
"content": "<|im_start|>",
|
| 14 |
+
"lstrip": false,
|
| 15 |
+
"normalized": false,
|
| 16 |
+
"rstrip": false,
|
| 17 |
+
"single_word": false,
|
| 18 |
+
"special": true
|
| 19 |
+
},
|
| 20 |
+
"151645": {
|
| 21 |
+
"content": "<|im_end|>",
|
| 22 |
+
"lstrip": false,
|
| 23 |
+
"normalized": false,
|
| 24 |
+
"rstrip": false,
|
| 25 |
+
"single_word": false,
|
| 26 |
+
"special": true
|
| 27 |
+
}
|
| 28 |
+
},
|
| 29 |
+
"additional_special_tokens": ["<|im_start|>", "<|im_end|>"],
|
| 30 |
+
"bos_token": null,
|
| 31 |
+
"chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
|
| 32 |
+
"clean_up_tokenization_spaces": false,
|
| 33 |
+
"eos_token": "<|im_end|>",
|
| 34 |
+
"errors": "replace",
|
| 35 |
+
"model_max_length": 32768,
|
| 36 |
+
"pad_token": "<|endoftext|>",
|
| 37 |
+
"split_special_tokens": false,
|
| 38 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 39 |
+
"unk_token": null
|
| 40 |
+
}
|
scripts/CosyVoice-BlankEN/vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
scripts/audio.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import torch
|
| 3 |
+
import torch.utils.data
|
| 4 |
+
from librosa.filters import mel as librosa_mel_fn
|
| 5 |
+
from scipy.io.wavfile import read
|
| 6 |
+
|
| 7 |
+
MAX_WAV_VALUE = 32768.0
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def load_wav(full_path):
    """Read a WAV file with ``scipy.io.wavfile.read``.

    Args:
        full_path: Path (or file object) accepted by ``scipy.io.wavfile.read``.

    Returns:
        A ``(data, sampling_rate)`` tuple -- note the order is swapped
        relative to what scipy returns.
    """
    rate, samples = read(full_path)
    return samples, rate
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """Log-compress ``x`` (NumPy version).

    Values below ``clip_val`` are raised to ``clip_val`` first so the
    logarithm never sees zero or negative input; the floored values are
    scaled by ``C`` before taking the natural log.
    """
    floored = np.clip(x, a_min=clip_val, a_max=None)
    return np.log(floored * C)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def dynamic_range_decompression(x, C=1):
    """Undo the log compression (NumPy version): ``exp(x)`` divided by ``C``."""
    expanded = np.exp(x)
    return expanded / C
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """Log-compress tensor ``x`` (torch version).

    Values are clamped to at least ``clip_val``, scaled by ``C`` and passed
    through the natural logarithm.
    """
    floored = torch.clamp(x, min=clip_val)
    return torch.log(floored * C)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def dynamic_range_decompression_torch(x, C=1):
    """Undo the log compression (torch version): ``exp(x)`` divided by ``C``."""
    expanded = torch.exp(x)
    return expanded / C
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def spectral_normalize_torch(magnitudes):
    """Apply ``dynamic_range_compression_torch`` with its default arguments."""
    return dynamic_range_compression_torch(magnitudes)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def spectral_de_normalize_torch(magnitudes):
    """Apply ``dynamic_range_decompression_torch`` with its default arguments."""
    return dynamic_range_decompression_torch(magnitudes)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# Module-level caches so the mel filterbank and the analysis window are built
# once per configuration rather than on every call.  Keys are tuples of every
# parameter that influences the cached tensor (see mel_spectrogram).
mel_basis = {}
hann_window = {}


def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
    """Compute a log-compressed mel spectrogram of ``y`` using ``torch.stft``.

    The waveform is reflect-padded by ``(n_fft - hop_size) / 2`` samples on
    each side, transformed with a Hann-windowed STFT, reduced to magnitudes,
    projected onto a librosa mel filterbank and finally passed through
    ``spectral_normalize_torch``.

    Args:
        y: Waveform tensor.  Presumably shaped ``(batch, samples)`` with
            values in ``[-1, 1]`` -- TODO confirm against callers.  Values
            outside that range are only reported on stdout, not clipped.
        n_fft: FFT size for the STFT and the mel filterbank.
        num_mels: Number of mel bands.
        sampling_rate: Sample rate handed to the mel filterbank.
        hop_size: STFT hop length.
        win_size: STFT window length (a Hann window of this size is used).
        fmin: Lowest frequency of the mel filterbank.
        fmax: Highest frequency of the mel filterbank.
        center: Forwarded to ``torch.stft``.

    Returns:
        The log-mel tensor; presumably ``(batch, num_mels, frames)`` for a
        ``(batch, samples)`` input -- TODO confirm.
    """
    if torch.min(y) < -1.0:
        print("min value is ", torch.min(y))
    if torch.max(y) > 1.0:
        print("max value is ", torch.max(y))

    device = y.device

    # Key the caches on *every* parameter that shapes the cached tensor.  The
    # previous "fmax_device" key silently reused a stale filterbank/window when
    # n_fft, num_mels, sampling_rate, fmin or win_size changed between calls.
    mel_key = (n_fft, num_mels, sampling_rate, fmin, fmax, str(device))
    if mel_key not in mel_basis:
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel_basis[mel_key] = torch.from_numpy(mel).float().to(device)

    window_key = (win_size, str(device))
    if window_key not in hann_window:
        hann_window[window_key] = torch.hann_window(win_size).to(device)

    # Manual reflect padding; with center=False this keeps frames aligned the
    # same way the original implementation did.
    pad = int((n_fft - hop_size) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode="reflect")
    y = y.squeeze(1)

    spec = torch.view_as_real(
        torch.stft(
            y,
            n_fft,
            hop_length=hop_size,
            win_length=win_size,
            window=hann_window[window_key],
            center=center,
            pad_mode="reflect",
            normalized=False,
            onesided=True,
            return_complex=True,
        )
    )

    # Magnitude from (real, imag); the epsilon keeps sqrt away from exact zero.
    spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))

    spec = torch.matmul(mel_basis[mel_key], spec)
    spec = spectral_normalize_torch(spec)

    return spec
|
scripts/cosyvoice2_tokenizer.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import AutoTokenizer, PreTrainedTokenizerFast
|
| 2 |
+
from http.server import HTTPServer, BaseHTTPRequestHandler
|
| 3 |
+
import json
|
| 4 |
+
import argparse
|
| 5 |
+
from tokenizer.tokenizer import get_qwen_tokenizer
|
| 6 |
+
|
| 7 |
+
class Tokenizer_Http:
    """Thin wrapper around the tokenizer returned by ``get_qwen_tokenizer``.

    It exposes ``encode``/``decode`` plus fixed end-of-sequence values for the
    HTTP request handlers in this module.
    """

    def __init__(self):
        # Loads the tokenizer from the "CosyVoice-BlankEN" directory.
        self.tokenizer = get_qwen_tokenizer("CosyVoice-BlankEN", True)

    def encode(self, prompt):
        """Return the token ids for ``prompt`` with all special tokens allowed."""
        return self.tokenizer.encode(prompt, allowed_special="all")

    def decode(self, token_ids):
        """Return the text for ``token_ids`` as produced by the wrapped tokenizer."""
        return self.tokenizer.decode(token_ids)

    # NOTE: no ``bos_id`` / ``bos_token`` accessors are defined on this class.

    @property
    def eos_id(self):
        """Hard-coded end-of-sequence id.

        NOTE(review): this constant is not read from the wrapped tokenizer --
        confirm it matches the deployed model.
        """
        return 1773

    @property
    def eos_token(self):
        """Hard-coded end-of-sequence token string (not read from the tokenizer)."""
        return "<|eot_id|>"
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# Module-level tokenizer instance used by the Request handler below; it is
# constructed at import time.
tokenizer = Tokenizer_Http()

# print(tokenizer.bos_id, tokenizer.bos_token, tokenizer.eos_id, tokenizer.eos_token)
# Startup smoke test: print the token ids produced for a sample prompt.
print(tokenizer.encode("hello world"))
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
class Request(BaseHTTPRequestHandler):
    """HTTP handler that exposes the module-level ``tokenizer`` over GET/POST.

    GET ``/bos_id`` and ``/eos_id`` answer ``{"bos_id": n}`` / ``{"eos_id": n}``
    (``-1`` when the id is unavailable).  POST ``/encode`` takes
    ``{"text": ...}`` and answers ``{"token_ids": ...}``; POST ``/decode``
    takes ``{"token_ids": ...}`` and answers ``{"text": ...}``.  Unknown paths
    answer with the literal body ``error`` (status 200, as before).  A POST
    body that cannot be parsed, or that lacks the expected field, answers
    ``error`` with status 400 instead of raising after headers were sent.
    """

    timeout = 5
    server_version = 'Apache'

    def _reply(self, kind, msg, status=200):
        """Print ``msg`` and send it as the response body.

        ``kind`` is written to the custom ``type`` response header ("get" or
        "post").
        """
        print(msg)
        self.send_response(status)
        self.send_header("type", kind)  # custom response header
        self.end_headers()
        self.wfile.write(str(msg).encode())  # body is sent as bytes

    def _content_length(self):
        """Return the request's Content-Length, or 0 if absent or invalid."""
        try:
            length = int(self.headers.get('content-length') or 0)
        except ValueError:
            return 0
        # A negative length would make rfile.read() wait for EOF.
        return max(length, 0)

    def do_GET(self):
        """Serve the ``/bos_id`` and ``/eos_id`` queries."""
        print(self.path)

        if self.path == '/bos_id':
            # The tokenizer wrapper may not define bos_id at all; treat a
            # missing attribute like None instead of raising AttributeError.
            bos_id = getattr(tokenizer, 'bos_id', None)
            msg = json.dumps({'bos_id': -1 if bos_id is None else bos_id})
        elif self.path == '/eos_id':
            eos_id = tokenizer.eos_id
            msg = json.dumps({'eos_id': -1 if eos_id is None else eos_id})
        else:
            msg = 'error'

        self._reply("get", msg)

    def do_POST(self):
        """Serve the ``/encode`` and ``/decode`` requests (JSON bodies)."""
        # Read the raw request body sent by the client.
        raw = self.rfile.read(self._content_length())

        if self.path == '/encode':
            field = 'text'
        elif self.path == '/decode':
            field = 'token_ids'
        else:
            self._reply("post", 'error')
            return

        # Validate the payload before any headers go out so that a bad request
        # gets a proper error response rather than a dropped connection.
        try:
            value = json.loads(raw.decode())[field]
        except (ValueError, KeyError, TypeError):
            self._reply("post", 'error', status=400)
            return

        if field == 'text':
            token_ids = tokenizer.encode(value)
            msg = json.dumps({'token_ids': -1 if token_ids is None else token_ids})
        else:
            text = tokenizer.decode(value)
            msg = json.dumps({'text': "" if text is None else text})

        self._reply("post", msg)
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
if __name__ == "__main__":
    # Command-line options select the address the server binds to.
    parser = argparse.ArgumentParser()
    parser.add_argument('--host', type=str, default='localhost')
    parser.add_argument('--port', type=int, default=12345)
    options = parser.parse_args()

    # (host, port) pair in the form HTTPServer expects.
    address = (options.host, options.port)
    print('http://%s:%s' % address)

    # Create the server with the Request handler and serve until interrupted.
    server = HTTPServer(address, Request)
    server.serve_forever()
|