lihongjie committed on Sep 5

Commit

f66b0a3

1 Parent(s): 9a83bfa

update

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +55 -0
.gitignore +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/llm.llm_embedding.float16.bin +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/llm.llm_embedding.float32.bin +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/llm.llm_embedding.npy +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/llm.speech_embedding.float16.bin +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/llm.speech_embedding.float32.bin +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/llm.speech_embedding.npy +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/llm_decoder.axmodel +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/model.embed_tokens.weight.bfloat16.bin +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/model.embed_tokens.weight.float32.bin +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/model.embed_tokens.weight.npy +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l0_together.axmodel +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l10_together.axmodel +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l11_together.axmodel +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l12_together.axmodel +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l13_together.axmodel +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l14_together.axmodel +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l15_together.axmodel +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l16_together.axmodel +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l17_together.axmodel +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l18_together.axmodel +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l19_together.axmodel +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l1_together.axmodel +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l20_together.axmodel +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l21_together.axmodel +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l22_together.axmodel +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l23_together.axmodel +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l2_together.axmodel +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l3_together.axmodel +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l4_together.axmodel +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l5_together.axmodel +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l6_together.axmodel +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l7_together.axmodel +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l8_together.axmodel +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l9_together.axmodel +3 -0
CosyVoice-BlankEN-Ax650-prefill_512/qwen2_post.axmodel +3 -0
README.md +139 -3
asset/cross_lingual_prompt.wav +3 -0
asset/dingding.png +3 -0
asset/output.wav +3 -0
asset/zero_shot_prompt.wav +3 -0
frontend-onnx/campplus.onnx +3 -0
frontend-onnx/speech_tokenizer_v2.onnx +3 -0
main_ax650 +3 -0
run.sh +20 -0
scripts/CosyVoice-BlankEN/tokenizer_config.json +40 -0
scripts/CosyVoice-BlankEN/vocab.json +0 -0
scripts/audio.py +83 -0
scripts/cosyvoice2_tokenizer.py +124 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,58 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l16_together.axmodel filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l18_together.axmodel filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l8_together.axmodel filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/llm.speech_embedding.float16.bin filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l4_together.axmodel filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/qwen2_post.axmodel filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/model.embed_tokens.weight.npy filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l19_together.axmodel filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l21_together.axmodel filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l6_together.axmodel filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l7_together.axmodel filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l9_together.axmodel filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/llm.llm_embedding.float32.bin filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/llm.speech_embedding.npy filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l0_together.axmodel filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l20_together.axmodel filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/llm.llm_embedding.float16.bin filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l10_together.axmodel filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l13_together.axmodel filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l14_together.axmodel filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l3_together.axmodel filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/model.embed_tokens.weight.float32.bin filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l1_together.axmodel filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l2_together.axmodel filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/llm.llm_embedding.npy filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l15_together.axmodel filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l23_together.axmodel filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l5_together.axmodel filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/llm_decoder.axmodel filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/llm.speech_embedding.float32.bin filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/model.embed_tokens.weight.bfloat16.bin filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l11_together.axmodel filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l12_together.axmodel filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l17_together.axmodel filter=lfs diff=lfs merge=lfs -text
+CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l22_together.axmodel filter=lfs diff=lfs merge=lfs -text
+asset/output.wav filter=lfs diff=lfs merge=lfs -text
+asset/zero_shot_prompt.wav filter=lfs diff=lfs merge=lfs -text
+asset/cross_lingual_prompt.wav filter=lfs diff=lfs merge=lfs -text
+asset/dingding.png filter=lfs diff=lfs merge=lfs -text
+token2wav-axmodels/flow_encoder_28.axmodel filter=lfs diff=lfs merge=lfs -text
+token2wav-axmodels/flow_estimator_250.axmodel filter=lfs diff=lfs merge=lfs -text
+token2wav-axmodels/flow.input_embedding.float32.bin filter=lfs diff=lfs merge=lfs -text
+token2wav-axmodels/hift_58.axmodel filter=lfs diff=lfs merge=lfs -text
+token2wav-axmodels/hift_50_first.axmodel filter=lfs diff=lfs merge=lfs -text
+frontend-onnx/speech_tokenizer_v2.onnx filter=lfs diff=lfs merge=lfs -text
+token2wav-axmodels/flow_encoder_50_final.axmodel filter=lfs diff=lfs merge=lfs -text
+token2wav-axmodels/flow_encoder_53.axmodel filter=lfs diff=lfs merge=lfs -text
+token2wav-axmodels/flow_encoder_78.axmodel filter=lfs diff=lfs merge=lfs -text
+token2wav-axmodels/flow_estimator_300.axmodel filter=lfs diff=lfs merge=lfs -text
+token2wav-axmodels/flow.input_embedding.npy filter=lfs diff=lfs merge=lfs -text
+frontend-onnx/campplus.onnx filter=lfs diff=lfs merge=lfs -text
+main_ax650 filter=lfs diff=lfs merge=lfs -text
+token2wav-axmodels/flow_estimator_200.axmodel filter=lfs diff=lfs merge=lfs -text
+token2wav-axmodels/flow.input_embedding.float16.bin filter=lfs diff=lfs merge=lfs -text
+scripts/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+# Generated audio written by the demo binary.
+output*.wav
+__pycache__/
+*.txt
+# The README installs dependencies from scripts/requirements.txt, so that
+# file must stay tracked even though *.txt is ignored above.
+!requirements.txt

CosyVoice-BlankEN-Ax650-prefill_512/llm.llm_embedding.float16.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:978e40e848cab25f75617ff943df4e0f2206456f9d801503864c5077441785f9
+size 3584

CosyVoice-BlankEN-Ax650-prefill_512/llm.llm_embedding.float32.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:edd849b97257d603c196016d0c5c8789f2883f6f3f3d3027894d1b640b82e336
+size 7168

CosyVoice-BlankEN-Ax650-prefill_512/llm.llm_embedding.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a7b3d907bbdb6c19282fbd7cfc6b983dad9e11bfcd47065f1c468ee5748098f3
+size 7296

CosyVoice-BlankEN-Ax650-prefill_512/llm.speech_embedding.float16.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b1c06ba6c2d4960d217cc0603238007c326ee709b8c6454cb2ce7793593c86fe
+size 11762688

CosyVoice-BlankEN-Ax650-prefill_512/llm.speech_embedding.float32.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0b0eafc818e5f45139ac002ccf8894ce13ae54022da258bf81348defb639f6a0
+size 23525376

CosyVoice-BlankEN-Ax650-prefill_512/llm.speech_embedding.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dd1b15ba46f7a521014804cf6d370e53a2797eeb42c2e29ce37ce04938eaffb0
+size 23525504

CosyVoice-BlankEN-Ax650-prefill_512/llm_decoder.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8b90819939556ea19bba1900a4be4a4697e7d3b154912f459ac8d199dda566fd
+size 6321489

CosyVoice-BlankEN-Ax650-prefill_512/model.embed_tokens.weight.bfloat16.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2289f29d6156cc9b55020b1cdc6207b0477baa19d1f822d0ff8247ef4015c1b7
+size 272269312

CosyVoice-BlankEN-Ax650-prefill_512/model.embed_tokens.weight.float32.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:298a6f5cb4039f7b1dcf7bca4ba132f710afc949a6dd1bba8b2cf23fa45ebbf3
+size 544538624

CosyVoice-BlankEN-Ax650-prefill_512/model.embed_tokens.weight.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f802eeb1dbbca75062c93feba83b1e804d95d867c86fbda6586855715adc2f3b
+size 544538752

CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l0_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0aebad8d31a03657e40a082f2765dae803abe931270d565610f69b2c475b83bc
+size 17994026

CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l10_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:805216134d5b4e19d00073aa6211ce911502aabfb8be4087e244afca62f17edf
+size 17994026

CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l11_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5b09d2e98ce163f0df076413345373222c3f4853cd8e7242aed6989858a20907
+size 17994026

CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l12_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b673053276ef44f1c0703f109e73d263c9652e40aa629ba7a1396b4b545f3e5c
+size 17994026

CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l13_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fd857094ea1614fe1e5033e24f428df4cb16c78ecdfbc2b2370a762aa80b3169
+size 17994026

CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l14_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:182d7e201b1436dec547d87c1334d2182966f60b786e7a7246a0732757528bf1
+size 17994026

CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l15_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4f409277a24082ddca92b59efe5fe62ae8b5ee5fa2248fb8b467275b71369d0c
+size 17994026

CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l16_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1bd75fbfb5b955b92f2c8f86dc881a5711a178823d7234df6d303cf5c4ba563c
+size 17994026

CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l17_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:707031640b471510e54e8f44cd732110dee3d95cca6013e2c39f8b27cb59553d
+size 17994026

CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l18_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7608f861bb468f103396cd7a7d69d8b53990b4d4fd116008b228974fae94746d
+size 17994026

CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l19_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2781fe5f613cfa72356aa76f08836c88b7011f0784412b349cd0b371e096bbab
+size 17994026

CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l1_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1c963b55a8483f370ed908fe2fe40ae040b299ffd9da474de00d3dea53bda04d
+size 17994026

CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l20_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a416f42098d31f72463504918b602b95c0ccffb09767474d2f78a0d6a6124bf7
+size 17994026

CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l21_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:321815f246ce90cf9b36fd5b82b193620150ca77341749df0fdf6987e93471b0
+size 17994026

CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l22_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6839b2c0838db525f8422ba3870dc4731edfddf514e24d5d472277b08a4a1b1f
+size 17994026

CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l23_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ff11b7670e19c26fb25759da8e1cfc913d0615edb297cd1928c2ecdd60500a8f
+size 17994026

CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l2_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:55ccb4f87cccede314e377b4f816a539d7862992e1ba4bd0b230a320ff661abc
+size 17994026

CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l3_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c402b7518050575e78f8b0162a4a3f911d54390b3d3b29f8df833c78e0149e06
+size 17994026

CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l4_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:74f5ec27b61c8008cba08a2b3e64d8bde5716852a5d1b4a35ce45476e6c83db8
+size 17994026

CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l5_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:48823dada608acb7ddc1c2735db1bcb30c5150d972ee454d52b25c4fefbe2911
+size 17994026

CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l6_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6ca1277efcc43ba1767970b773cb5f1f478d795991ea01471eee59f755d188c8
+size 17994026

CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l7_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2312ada84911e6a306846f95dd8889ede9d9e71d09cfb7cead34b87a287fea2e
+size 17994026

CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l8_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e7bdc2563f8f6efbefe121cb1b20d96166183453ce9ef79349257a3eba0aa28c
+size 17994026

CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l9_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b28414e4228a9a8459d18af962c5149292310076116b4012648ca72727c6de7f
+size 17994026

CosyVoice-BlankEN-Ax650-prefill_512/qwen2_post.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e60b8f3aba22079cbc0e71cad0ce91ad662493266f540eb46124f285a9ccc519
+size 147957523

README.md CHANGED Viewed

@@ -1,3 +1,139 @@
----
-license: mit
----

+---
+license: mit
+language:
+- en
+- zh
+base_model:
+- CosyVoice2
+pipeline_tag: Text-to-Speech
+library_name: transformers
+tags:
+- CosyVoice2
+- Speech
+---
+# CosyVoice2
+This version of CosyVoice2 has been converted to run on the Axera NPU using **w8a16** quantization.
+Compatible with Pulsar2 version: 4.2
+## Conversion tool links
+If you are interested in model conversion, you can try exporting the axmodel yourself, starting from the original repo:
+[Cosyvoice](https://github.com/FunAudioLLM/CosyVoice)
+[Pulsar2 Link, How to Convert LLM from Huggingface to axmodel](https://pulsar2-docs.readthedocs.io/en/latest/appendix/build_llm.html)
+[AXera NPU HOST LLM Runtime](https://github.com/AXERA-TECH/Cosyvoice2.Axera)
+## Support Platform
+- AX650
+  - AX650N DEMO Board
+**Speech Generation**
+| Stage | Time |
+|------|------|
+| llm prefill ( input_token_num + prompt_token_num in [0, 128] ) | 104 ms  |
+| llm prefill ( input_token_num + prompt_token_num in [128, 256] ) | 234 ms  |
+| Decode  |  21.24 token/s |
+## How to use
+Download all files from this repository to the device
+### 1. Text to Speech (Voice Cloning)
+#### 1. Install python library
+Steps 2 and 3 require the use of these Python packages. If you run Steps 2 and 3 on a PC, install them on the PC.
+```
+pip3 install -r scripts/requirements.txt
+```
+#### 2. Process Prompt Speech
+```
+python scripts/process_prompt.py
+```
+Pass parameters according to the actual situation.
+```
+args.add_argument('--model_dir', type=str, default="../../model_convert/pretrained_models/CosyVoice2-0.5B/")
+args.add_argument('--wetext_dir', type=str, default="../../model_convert/pengzhendong/wetext/")
+args.add_argument('--sample_rate', type=int, default=24000)
+args.add_argument('--zero_shot_spk_id', type=str, default="")
+args.add_argument('--tts_text', type=str, default="君不见黄河之水天上来，奔流到海不复回。君不见高堂明镜悲白发，朝如青丝暮成雪。")
+args.add_argument('--prompt_text', type=str, default="希望你以后能够做的比我还好呦。")
+args.add_argument('--prompt_speech', type=str, default="../../model_convert/asset/zero_shot_prompt.wav")
+```
+#### 3. Start HTTP Tokenizer Server
+```
+cd scripts
+python cosyvoice2_tokenizer.py --host {your host} --port {your port}
+```
+#### 4. Run on AX650 Board
+1) Modify the HTTP host in `run.sh`.
+2) Copy `scripts/run.sh`, `build/install/bin/main`, and the files generated by `process_prompt.py` to the AX650 board.
+3) Run `run.sh`
+```shell
+root@ax650 ~/yongqiang/lhj/Cosyvoice2.Axera/cpp/src # bash run.sh
+rm: cannot remove 'output*.wav': No such file or directory
+[I][                            Init][ 108]: LLM init start
+[I][                            Init][  34]: connect http://10.122.86.184:12345 ok
+bos_id: 0, eos_id: 1773
+  7% | ███                               |   2 /  27 [3.11s<42.04s, 0.64 count/s] embed_selector init ok[I][                            Init][ 138]: attr.axmodel_num:24
+100% | ████████████████████████████████ |  27 /  27 [10.32s<10.32s, 2.62 count/s] init post axmodel ok,remain_cmm(7178 MB)
+[I][                            Init][ 216]: max_token_len : 1023
+[I][                            Init][ 221]: kv_cache_size : 128, kv_cache_num: 1023
+[I][                            Init][ 229]: prefill_token_num : 128
+[I][                            Init][ 233]: grp: 1, prefill_max_token_num : 1
+[I][                            Init][ 233]: grp: 2, prefill_max_token_num : 128
+[I][                            Init][ 233]: grp: 3, prefill_max_token_num : 256
+[I][                            Init][ 233]: grp: 4, prefill_max_token_num : 384
+[I][                            Init][ 233]: grp: 5, prefill_max_token_num : 512
+[I][                            Init][ 237]: prefill_max_token_num : 512
+[I][                            Init][ 249]: LLM init ok
+[I][                            Init][ 154]: Token2Wav init ok
+[I][                            main][ 273]:
+[I][                             Run][ 388]: input token num : 142, prefill_split_num : 2
+[I][                             Run][ 422]: input_num_token:128
+[I][                             Run][ 422]: input_num_token:14
+[I][                             Run][ 607]: ttft: 236.90 ms
+[Main/Token2Wav Thread] Processing batch of 28 tokens...
+Successfully saved audio to output_0.wav (32-bit Float PCM).
+[Main/Token2Wav Thread] Processing batch of 53 tokens...
+Successfully saved audio to output_1.wav (32-bit Float PCM).
+[Main/Token2Wav Thread] Processing batch of 78 tokens...
+Successfully saved audio to output_2.wav (32-bit Float PCM).
+[Main/Token2Wav Thread] Processing batch of 78 tokens...
+Successfully saved audio to output_3.wav (32-bit Float PCM).
+[Main/Token2Wav Thread] Processing batch of 78 tokens...
+Successfully saved audio to output_4.wav (32-bit Float PCM).
+[Main/Token2Wav Thread] Processing batch of 78 tokens...
+Successfully saved audio to output_5.wav (32-bit Float PCM).
+[Main/Token2Wav Thread] Processing batch of 78 tokens...
+Successfully saved audio to output_6.wav (32-bit Float PCM).
+[Main/Token2Wav Thread] Processing batch of 78 tokens...
+Successfully saved audio to output_7.wav (32-bit Float PCM).
+[Main/Token2Wav Thread] Processing batch of 78 tokens...
+Successfully saved audio to output_8.wav (32-bit Float PCM).
+[Main/Token2Wav Thread] Processing batch of 78 tokens...
+Successfully saved audio to output_9.wav (32-bit Float PCM).
+[I][                             Run][ 723]: hit eos, llm finished
+[I][                             Run][ 753]: llm finished
+[Main/Token2Wav Thread] Buffer is empty and LLM finished. Exiting.
+[I][                             Run][ 758]: total decode tokens:271
+[N][                             Run][ 759]: hit eos,avg 21.47 token/s
+Successfully saved audio to output_10.wav (32-bit Float PCM).
+Successfully saved audio to output.wav (32-bit Float PCM).
+Voice generation pipeline completed.
+Type "q" to exit, Ctrl+c to stop current running
+text >>
+```
+Output speech:
+[output.wav](asset/output.wav)

asset/cross_lingual_prompt.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:353a7715c2e4811f4045658b29d1ce67ecad5120e09de10ce890f1763aab486c
+size 606404

asset/dingding.png ADDED Viewed

Git LFS Details

SHA256: 3870bb0a4e3df1f643e09c960b7e03d80da798509c86eaa326db205236b861d5
Pointer size: 130 Bytes
Size of remote file: 96.4 kB

asset/output.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5c7a4c3837145df17e851c177f849446036e6f541d78eb6e107ea6b9e7b07672
+size 1067564

asset/zero_shot_prompt.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bd199eb7109fd6ce9943cb297e3cf350c1073af014063dfadbdc100230526243
+size 111496

frontend-onnx/campplus.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a6ac6a63997761ae2997373e2ee1c47040854b4b759ea41ec48e4e42df0f4d73
+size 28303423

frontend-onnx/speech_tokenizer_v2.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d43342aa12163a80bf07bffb94c9de2e120a8df2f9917cd2f642e7f4219c6f71
+size 496082973

main_ax650 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4338e14a404311a5533f2e27d17726a5142e9fe7b92016ea2e845dad9dfe8bd1
+size 6641680

run.sh ADDED Viewed

	@@ -0,0 +1,20 @@

+#!/usr/bin/env bash
+# Run the CosyVoice2 demo binary (main_ax650) with the LLM and token2wav models
+# shipped in this repository.
+# Before running, point --filename_tokenizer_model at the host/port where
+# scripts/cosyvoice2_tokenizer.py is serving.
+LLM_DIR=CosyVoice-BlankEN-Ax650-prefill_512/
+TOKEN2WAV_DIR=token2wav-axmodels/
+# -f: stay silent and succeed when there is no previous output (e.g. first run).
+rm -f output*.wav
+./main_ax650 \
+--template_filename_axmodel "${LLM_DIR}/qwen2_p128_l%d_together.axmodel" \
+--token2wav_axmodel_dir "$TOKEN2WAV_DIR" \
+--axmodel_num 24 \
+--bos 0 --eos 0 \
+--filename_tokenizer_model "http://10.122.86.184:12345" \
+--filename_post_axmodel "${LLM_DIR}/qwen2_post.axmodel" \
+--filename_decoder_axmodel "${LLM_DIR}/llm_decoder.axmodel" \
+--filename_tokens_embed "${LLM_DIR}/model.embed_tokens.weight.bfloat16.bin" \
+--filename_llm_embed "${LLM_DIR}/llm.llm_embedding.float16.bin" \
+--filename_speech_embed "${LLM_DIR}/llm.speech_embedding.float16.bin" \
+--continue 0 \
+--text "君不见黄河之水天上来，奔流到海不复回。君不见高堂明镜悲白发，朝如青丝暮成雪。"
+# Open up permissions on the generated audio for every user.
+chmod 777 output*.wav

scripts/CosyVoice-BlankEN/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,40 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": ["<|im_start|>", "<|im_end|>"],
+  "bos_token": null,
+  "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

scripts/CosyVoice-BlankEN/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

scripts/audio.py ADDED Viewed

	@@ -0,0 +1,83 @@

+import numpy as np
+import torch
+import torch.utils.data
+from librosa.filters import mel as librosa_mel_fn
+from scipy.io.wavfile import read
+MAX_WAV_VALUE = 32768.0
+def load_wav(full_path):
+    """Read the WAV file at ``full_path`` and return ``(data, sampling_rate)``.
+
+    Thin wrapper around ``scipy.io.wavfile.read`` that swaps the order of the
+    returned pair so the samples come first.
+    """
+    loaded = read(full_path)
+    return loaded[1], loaded[0]
+def dynamic_range_compression(x, C=1, clip_val=1e-5):
+    """Return ``log(max(x, clip_val) * C)`` computed with NumPy.
+
+    Values below ``clip_val`` are raised to ``clip_val`` first so the
+    logarithm never sees zero or negative input.
+    """
+    floored = np.clip(x, a_min=clip_val, a_max=None)
+    return np.log(floored * C)
+def dynamic_range_decompression(x, C=1):
+    """Return ``exp(x) / C`` computed with NumPy (undoes the log compression)."""
+    expanded = np.exp(x)
+    return expanded / C
+def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
+    """Return ``log(clamp(x, min=clip_val) * C)`` computed with torch.
+
+    Values below ``clip_val`` are clamped up to ``clip_val`` before the log.
+    """
+    floored = torch.clamp(x, min=clip_val)
+    return torch.log(floored * C)
+def dynamic_range_decompression_torch(x, C=1):
+    """Return ``exp(x) / C`` computed with torch (undoes the log compression)."""
+    expanded = torch.exp(x)
+    return expanded / C
+def spectral_normalize_torch(magnitudes):
+    """Log-compress ``magnitudes`` using dynamic_range_compression_torch defaults."""
+    return dynamic_range_compression_torch(magnitudes)
+def spectral_de_normalize_torch(magnitudes):
+    """Undo the log compression using dynamic_range_decompression_torch defaults."""
+    return dynamic_range_decompression_torch(magnitudes)
+# Module-level caches reused across mel_spectrogram() calls.
+# mel_basis holds mel filterbank tensors, hann_window holds analysis windows.
+mel_basis = {}
+hann_window = {}
+def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
+    """Compute a log-compressed mel spectrogram of ``y`` with ``torch.stft``.
+
+    Args:
+        y: waveform tensor. Presumably shaped (batch, samples) with values in
+            [-1, 1] -- TODO confirm against callers; out-of-range values are
+            only reported on stdout, not rejected.
+        n_fft: FFT size, also used to build the mel filterbank.
+        num_mels: number of mel bands.
+        sampling_rate: sample rate passed to the mel filterbank builder.
+        hop_size: STFT hop length.
+        win_size: STFT window length (Hann window).
+        fmin: lowest filterbank frequency.
+        fmax: highest filterbank frequency.
+        center: forwarded to ``torch.stft``.
+
+    Returns:
+        The mel filterbank applied to the STFT magnitude, passed through
+        ``spectral_normalize_torch``.
+    """
+    if torch.min(y) < -1.0:
+        print("min value is ", torch.min(y))
+    if torch.max(y) > 1.0:
+        print("max value is ", torch.max(y))
+    device = y.device
+    # Key each cache on every parameter that shapes the cached tensor, so a
+    # later call with a different configuration never reuses a stale entry.
+    mel_key = (sampling_rate, n_fft, num_mels, fmin, fmax, str(device))
+    if mel_key not in mel_basis:
+        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
+        mel_basis[mel_key] = torch.from_numpy(mel).float().to(device)
+    win_key = (win_size, str(device))
+    if win_key not in hann_window:
+        hann_window[win_key] = torch.hann_window(win_size).to(device)
+    # Reflect-pad both ends by (n_fft - hop_size) / 2 samples before the STFT.
+    pad = int((n_fft - hop_size) / 2)
+    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode="reflect")
+    y = y.squeeze(1)
+    spec = torch.view_as_real(
+        torch.stft(
+            y,
+            n_fft,
+            hop_length=hop_size,
+            win_length=win_size,
+            window=hann_window[win_key],
+            center=center,
+            pad_mode="reflect",
+            normalized=False,
+            onesided=True,
+            return_complex=True,
+        )
+    )
+    # Magnitude from (real, imag); the epsilon keeps sqrt away from exact zero.
+    spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
+    spec = torch.matmul(mel_basis[mel_key], spec)
+    spec = spectral_normalize_torch(spec)
+    return spec

scripts/cosyvoice2_tokenizer.py ADDED Viewed

	@@ -0,0 +1,124 @@

+from transformers import AutoTokenizer, PreTrainedTokenizerFast
+from http.server import HTTPServer, BaseHTTPRequestHandler
+import json
+import argparse
+from tokenizer.tokenizer import get_qwen_tokenizer
+class Tokenizer_Http():
+    """Wrapper around the tokenizer returned by ``get_qwen_tokenizer``.
+
+    Exposes ``encode``/``decode`` plus fixed end-of-sequence metadata.
+    The ``bos_id`` and ``bos_token`` accessors are disabled; they used to
+    read ``bos_token_id`` / ``bos_token`` from the wrapped tokenizer.
+    """
+    def __init__(self):
+        """Load the tokenizer from the ``CosyVoice-BlankEN`` directory."""
+        self.tokenizer = get_qwen_tokenizer("CosyVoice-BlankEN", True)
+    def encode(self, prompt):
+        """Return the token ids for ``prompt``, with all special tokens allowed."""
+        return self.tokenizer.encode(prompt, allowed_special="all")
+    def decode(self, token_ids):
+        """Return the text that the wrapped tokenizer decodes from ``token_ids``."""
+        text = self.tokenizer.decode(token_ids)
+        return text
+    @property
+    def eos_id(self):
+        """Hard-coded end-of-sequence id reported to clients."""
+        return 1773
+    @property
+    def eos_token(self):
+        """Hard-coded end-of-sequence token string.
+
+        NOTE(review): tokenizer_config.json declares ``<|im_end|>`` as the
+        eos token -- confirm which one clients expect.
+        """
+        return "<|eot_id|>"
+# Module-level tokenizer instance used by the Request handler; it is built
+# when this module is loaded.
+tokenizer = Tokenizer_Http()
+# print(tokenizer.bos_id, tokenizer.bos_token, tokenizer.eos_id, tokenizer.eos_token)
+# Print the token ids of a sample string at load time.
+print(tokenizer.encode("hello world"))
+class Request(BaseHTTPRequestHandler):
+    """HTTP handler exposing the module-level ``tokenizer`` as a small JSON API.
+
+    GET  /bos_id  -> {"bos_id": int}   (-1 when the tokenizer has no BOS id)
+    GET  /eos_id  -> {"eos_id": int}   (-1 when the tokenizer has no EOS id)
+    POST /encode  -> request {"text": str},        reply {"token_ids": [...]}
+    POST /decode  -> request {"token_ids": [...]}, reply {"text": str}
+
+    Any other path is answered with the plain-text body ``error``. The status
+    line and headers (always 200) are sent before the request is processed.
+    """
+    timeout = 5
+    server_version = 'Apache'
+    def do_GET(self):
+        """Answer ``/bos_id`` and ``/eos_id`` queries with a JSON object."""
+        print(self.path)
+        self.send_response(200)
+        self.send_header("type", "get")  # custom response header; optional
+        self.end_headers()
+        if self.path == '/bos_id':
+            # The tokenizer wrapper may not expose bos_id (its property is
+            # commented out in this file). Read it defensively so the request
+            # yields the -1 "no BOS id" reply instead of an AttributeError
+            # after the headers have already gone out.
+            bos_id = getattr(tokenizer, 'bos_id', None)
+            msg = json.dumps({'bos_id': -1 if bos_id is None else bos_id})
+        elif self.path == '/eos_id':
+            eos_id = tokenizer.eos_id
+            msg = json.dumps({'eos_id': -1 if eos_id is None else eos_id})
+        else:
+            msg = 'error'
+        print(msg)
+        msg = str(msg).encode()  # str -> bytes
+        self.wfile.write(msg)  # send the bytes back to the client
+    def do_POST(self):
+        """Answer ``/encode`` and ``/decode`` requests carrying a JSON body."""
+        # Read exactly Content-Length bytes of request body and decode to str.
+        data = self.rfile.read(int(self.headers['content-length']))
+        data = data.decode()
+        self.send_response(200)
+        self.send_header("type", "post")  # custom response header; optional
+        self.end_headers()
+        if self.path == '/encode':
+            req = json.loads(data)
+            token_ids = tokenizer.encode(req['text'])
+            msg = json.dumps({'token_ids': -1 if token_ids is None else token_ids})
+        elif self.path == '/decode':
+            req = json.loads(data)
+            text = tokenizer.decode(req['token_ids'])
+            msg = json.dumps({'text': "" if text is None else text})
+        else:
+            msg = 'error'
+        print(msg)
+        msg = str(msg).encode()  # str -> bytes
+        self.wfile.write(msg)  # send the bytes back to the client
+if __name__ == "__main__":
+    # Read the listen address from the command line, then serve requests forever.
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--host', type=str, default='localhost')
+    parser.add_argument('--port', type=int, default=12345)
+    cli = parser.parse_args()
+    # (address, port) pair; 'localhost' is equivalent to '127.0.0.1'.
+    host = (cli.host, cli.port)
+    print('http://%s:%s' % host)
+    # Build the server from the address pair and the Request handler class, then start it.
+    HTTPServer(host, Request).serve_forever()