lihongjie
		
	committed on
		
		
					Commit 
							
							·
						
						f66b0a3
	
1
								Parent(s):
							
							9a83bfa
								
update
Browse filesThis view is limited to 50 files because it contains too many changes.  
							See raw diff
- .gitattributes +55 -0
- .gitignore +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/llm.llm_embedding.float16.bin +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/llm.llm_embedding.float32.bin +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/llm.llm_embedding.npy +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/llm.speech_embedding.float16.bin +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/llm.speech_embedding.float32.bin +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/llm.speech_embedding.npy +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/llm_decoder.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/model.embed_tokens.weight.bfloat16.bin +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/model.embed_tokens.weight.float32.bin +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/model.embed_tokens.weight.npy +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l0_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l10_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l11_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l12_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l13_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l14_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l15_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l16_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l17_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l18_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l19_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l1_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l20_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l21_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l22_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l23_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l2_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l3_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l4_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l5_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l6_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l7_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l8_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l9_together.axmodel +3 -0
- CosyVoice-BlankEN-Ax650-prefill_512/qwen2_post.axmodel +3 -0
- README.md +139 -3
- asset/cross_lingual_prompt.wav +3 -0
- asset/dingding.png +3 -0
- asset/output.wav +3 -0
- asset/zero_shot_prompt.wav +3 -0
- frontend-onnx/campplus.onnx +3 -0
- frontend-onnx/speech_tokenizer_v2.onnx +3 -0
- main_ax650 +3 -0
- run.sh +20 -0
- scripts/CosyVoice-BlankEN/tokenizer_config.json +40 -0
- scripts/CosyVoice-BlankEN/vocab.json +0 -0
- scripts/audio.py +83 -0
- scripts/cosyvoice2_tokenizer.py +124 -0
    	
        .gitattributes
    CHANGED
    
    | @@ -33,3 +33,58 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text | |
| 33 | 
             
            *.zip filter=lfs diff=lfs merge=lfs -text
         | 
| 34 | 
             
            *.zst filter=lfs diff=lfs merge=lfs -text
         | 
| 35 | 
             
            *tfevents* filter=lfs diff=lfs merge=lfs -text
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 33 | 
             
            *.zip filter=lfs diff=lfs merge=lfs -text
         | 
| 34 | 
             
            *.zst filter=lfs diff=lfs merge=lfs -text
         | 
| 35 | 
             
            *tfevents* filter=lfs diff=lfs merge=lfs -text
         | 
| 36 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l16_together.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 37 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l18_together.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 38 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l8_together.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 39 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/llm.speech_embedding.float16.bin filter=lfs diff=lfs merge=lfs -text
         | 
| 40 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l4_together.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 41 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/qwen2_post.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 42 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/model.embed_tokens.weight.npy filter=lfs diff=lfs merge=lfs -text
         | 
| 43 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l19_together.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 44 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l21_together.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 45 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l6_together.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 46 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l7_together.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 47 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l9_together.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 48 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/llm.llm_embedding.float32.bin filter=lfs diff=lfs merge=lfs -text
         | 
| 49 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/llm.speech_embedding.npy filter=lfs diff=lfs merge=lfs -text
         | 
| 50 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l0_together.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 51 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l20_together.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 52 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/llm.llm_embedding.float16.bin filter=lfs diff=lfs merge=lfs -text
         | 
| 53 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l10_together.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 54 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l13_together.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 55 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l14_together.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 56 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l3_together.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 57 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/model.embed_tokens.weight.float32.bin filter=lfs diff=lfs merge=lfs -text
         | 
| 58 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l1_together.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 59 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l2_together.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 60 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/llm.llm_embedding.npy filter=lfs diff=lfs merge=lfs -text
         | 
| 61 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l15_together.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 62 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l23_together.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 63 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l5_together.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 64 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/llm_decoder.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 65 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/llm.speech_embedding.float32.bin filter=lfs diff=lfs merge=lfs -text
         | 
| 66 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/model.embed_tokens.weight.bfloat16.bin filter=lfs diff=lfs merge=lfs -text
         | 
| 67 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l11_together.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 68 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l12_together.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 69 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l17_together.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 70 | 
            +
            CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l22_together.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 71 | 
            +
            asset/output.wav filter=lfs diff=lfs merge=lfs -text
         | 
| 72 | 
            +
            asset/zero_shot_prompt.wav filter=lfs diff=lfs merge=lfs -text
         | 
| 73 | 
            +
            asset/cross_lingual_prompt.wav filter=lfs diff=lfs merge=lfs -text
         | 
| 74 | 
            +
            asset/dingding.png filter=lfs diff=lfs merge=lfs -text
         | 
| 75 | 
            +
            token2wav-axmodels/flow_encoder_28.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 76 | 
            +
            token2wav-axmodels/flow_estimator_250.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 77 | 
            +
            token2wav-axmodels/flow.input_embedding.float32.bin filter=lfs diff=lfs merge=lfs -text
         | 
| 78 | 
            +
            token2wav-axmodels/hift_58.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 79 | 
            +
            token2wav-axmodels/hift_50_first.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 80 | 
            +
            frontend-onnx/speech_tokenizer_v2.onnx filter=lfs diff=lfs merge=lfs -text
         | 
| 81 | 
            +
            token2wav-axmodels/flow_encoder_50_final.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 82 | 
            +
            token2wav-axmodels/flow_encoder_53.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 83 | 
            +
            token2wav-axmodels/flow_encoder_78.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 84 | 
            +
            token2wav-axmodels/flow_estimator_300.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 85 | 
            +
            token2wav-axmodels/flow.input_embedding.npy filter=lfs diff=lfs merge=lfs -text
         | 
| 86 | 
            +
            frontend-onnx/campplus.onnx filter=lfs diff=lfs merge=lfs -text
         | 
| 87 | 
            +
            main_ax650 filter=lfs diff=lfs merge=lfs -text
         | 
| 88 | 
            +
            token2wav-axmodels/flow_estimator_200.axmodel filter=lfs diff=lfs merge=lfs -text
         | 
| 89 | 
            +
            token2wav-axmodels/flow.input_embedding.float16.bin filter=lfs diff=lfs merge=lfs -text
         | 
| 90 | 
            +
            scripts/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken filter=lfs diff=lfs merge=lfs -text
         | 
    	
        .gitignore
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            output*.wav
         | 
| 2 | 
            +
            __pycache__/
         | 
| 3 | 
            +
            *.txt
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/llm.llm_embedding.float16.bin
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:978e40e848cab25f75617ff943df4e0f2206456f9d801503864c5077441785f9
         | 
| 3 | 
            +
            size 3584
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/llm.llm_embedding.float32.bin
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:edd849b97257d603c196016d0c5c8789f2883f6f3f3d3027894d1b640b82e336
         | 
| 3 | 
            +
            size 7168
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/llm.llm_embedding.npy
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:a7b3d907bbdb6c19282fbd7cfc6b983dad9e11bfcd47065f1c468ee5748098f3
         | 
| 3 | 
            +
            size 7296
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/llm.speech_embedding.float16.bin
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:b1c06ba6c2d4960d217cc0603238007c326ee709b8c6454cb2ce7793593c86fe
         | 
| 3 | 
            +
            size 11762688
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/llm.speech_embedding.float32.bin
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:0b0eafc818e5f45139ac002ccf8894ce13ae54022da258bf81348defb639f6a0
         | 
| 3 | 
            +
            size 23525376
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/llm.speech_embedding.npy
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:dd1b15ba46f7a521014804cf6d370e53a2797eeb42c2e29ce37ce04938eaffb0
         | 
| 3 | 
            +
            size 23525504
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/llm_decoder.axmodel
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:8b90819939556ea19bba1900a4be4a4697e7d3b154912f459ac8d199dda566fd
         | 
| 3 | 
            +
            size 6321489
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/model.embed_tokens.weight.bfloat16.bin
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:2289f29d6156cc9b55020b1cdc6207b0477baa19d1f822d0ff8247ef4015c1b7
         | 
| 3 | 
            +
            size 272269312
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/model.embed_tokens.weight.float32.bin
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:298a6f5cb4039f7b1dcf7bca4ba132f710afc949a6dd1bba8b2cf23fa45ebbf3
         | 
| 3 | 
            +
            size 544538624
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/model.embed_tokens.weight.npy
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:f802eeb1dbbca75062c93feba83b1e804d95d867c86fbda6586855715adc2f3b
         | 
| 3 | 
            +
            size 544538752
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l0_together.axmodel
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:0aebad8d31a03657e40a082f2765dae803abe931270d565610f69b2c475b83bc
         | 
| 3 | 
            +
            size 17994026
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l10_together.axmodel
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:805216134d5b4e19d00073aa6211ce911502aabfb8be4087e244afca62f17edf
         | 
| 3 | 
            +
            size 17994026
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l11_together.axmodel
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:5b09d2e98ce163f0df076413345373222c3f4853cd8e7242aed6989858a20907
         | 
| 3 | 
            +
            size 17994026
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l12_together.axmodel
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:b673053276ef44f1c0703f109e73d263c9652e40aa629ba7a1396b4b545f3e5c
         | 
| 3 | 
            +
            size 17994026
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l13_together.axmodel
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:fd857094ea1614fe1e5033e24f428df4cb16c78ecdfbc2b2370a762aa80b3169
         | 
| 3 | 
            +
            size 17994026
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l14_together.axmodel
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:182d7e201b1436dec547d87c1334d2182966f60b786e7a7246a0732757528bf1
         | 
| 3 | 
            +
            size 17994026
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l15_together.axmodel
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:4f409277a24082ddca92b59efe5fe62ae8b5ee5fa2248fb8b467275b71369d0c
         | 
| 3 | 
            +
            size 17994026
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l16_together.axmodel
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:1bd75fbfb5b955b92f2c8f86dc881a5711a178823d7234df6d303cf5c4ba563c
         | 
| 3 | 
            +
            size 17994026
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l17_together.axmodel
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:707031640b471510e54e8f44cd732110dee3d95cca6013e2c39f8b27cb59553d
         | 
| 3 | 
            +
            size 17994026
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l18_together.axmodel
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:7608f861bb468f103396cd7a7d69d8b53990b4d4fd116008b228974fae94746d
         | 
| 3 | 
            +
            size 17994026
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l19_together.axmodel
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:2781fe5f613cfa72356aa76f08836c88b7011f0784412b349cd0b371e096bbab
         | 
| 3 | 
            +
            size 17994026
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l1_together.axmodel
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:1c963b55a8483f370ed908fe2fe40ae040b299ffd9da474de00d3dea53bda04d
         | 
| 3 | 
            +
            size 17994026
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l20_together.axmodel
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:a416f42098d31f72463504918b602b95c0ccffb09767474d2f78a0d6a6124bf7
         | 
| 3 | 
            +
            size 17994026
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l21_together.axmodel
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:321815f246ce90cf9b36fd5b82b193620150ca77341749df0fdf6987e93471b0
         | 
| 3 | 
            +
            size 17994026
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l22_together.axmodel
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:6839b2c0838db525f8422ba3870dc4731edfddf514e24d5d472277b08a4a1b1f
         | 
| 3 | 
            +
            size 17994026
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l23_together.axmodel
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:ff11b7670e19c26fb25759da8e1cfc913d0615edb297cd1928c2ecdd60500a8f
         | 
| 3 | 
            +
            size 17994026
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l2_together.axmodel
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:55ccb4f87cccede314e377b4f816a539d7862992e1ba4bd0b230a320ff661abc
         | 
| 3 | 
            +
            size 17994026
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l3_together.axmodel
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:c402b7518050575e78f8b0162a4a3f911d54390b3d3b29f8df833c78e0149e06
         | 
| 3 | 
            +
            size 17994026
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l4_together.axmodel
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:74f5ec27b61c8008cba08a2b3e64d8bde5716852a5d1b4a35ce45476e6c83db8
         | 
| 3 | 
            +
            size 17994026
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l5_together.axmodel
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:48823dada608acb7ddc1c2735db1bcb30c5150d972ee454d52b25c4fefbe2911
         | 
| 3 | 
            +
            size 17994026
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l6_together.axmodel
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:6ca1277efcc43ba1767970b773cb5f1f478d795991ea01471eee59f755d188c8
         | 
| 3 | 
            +
            size 17994026
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l7_together.axmodel
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:2312ada84911e6a306846f95dd8889ede9d9e71d09cfb7cead34b87a287fea2e
         | 
| 3 | 
            +
            size 17994026
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l8_together.axmodel
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:e7bdc2563f8f6efbefe121cb1b20d96166183453ce9ef79349257a3eba0aa28c
         | 
| 3 | 
            +
            size 17994026
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/qwen2_p128_l9_together.axmodel
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:b28414e4228a9a8459d18af962c5149292310076116b4012648ca72727c6de7f
         | 
| 3 | 
            +
            size 17994026
         | 
    	
        CosyVoice-BlankEN-Ax650-prefill_512/qwen2_post.axmodel
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:e60b8f3aba22079cbc0e71cad0ce91ad662493266f540eb46124f285a9ccc519
         | 
| 3 | 
            +
            size 147957523
         | 
    	
        README.md
    CHANGED
    
    | @@ -1,3 +1,139 @@ | |
| 1 | 
            -
            ---
         | 
| 2 | 
            -
            license: mit
         | 
| 3 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            ---
         | 
| 2 | 
            +
            license: mit
         | 
| 3 | 
            +
            language:
         | 
| 4 | 
            +
            - en
         | 
| 5 | 
            +
            - zh
         | 
| 6 | 
            +
            base_model:
         | 
| 7 | 
            +
            - CosyVoice2
         | 
| 8 | 
            +
            pipeline_tag: Text-to-Speech
         | 
| 9 | 
            +
            library_name: transformers
         | 
| 10 | 
            +
            tags:
         | 
| 11 | 
            +
            - CosyVoice2
         | 
| 12 | 
            +
            - Speech
         | 
| 13 | 
            +
            ---
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            # CosyVoice2  
         | 
| 16 | 
            +
            This version of CosyVoice2 has been converted to run on the Axera NPU using **w8a16** quantization.
         | 
| 17 | 
            +
            Compatible with Pulsar2 version: 4.2
         | 
| 18 | 
            +
             | 
| 19 | 
            +
            ## Conversion tool links
         | 
| 20 | 
            +
            If you are interested in model conversion, you can export the axmodel yourself starting from the original repo: 
         | 
| 21 | 
            +
            [Cosyvoice](https://github.com/FunAudioLLM/CosyVoice)
         | 
| 22 | 
            +
             | 
| 23 | 
            +
            [Pulsar2 Link, How to Convert LLM from Huggingface to axmodel](https://pulsar2-docs.readthedocs.io/en/latest/appendix/build_llm.html) 
         | 
| 24 | 
            +
             | 
| 25 | 
            +
            [AXera NPU HOST LLM Runtime](https://github.com/AXERA-TECH/Cosyvoice2.Axera) 
         | 
| 26 | 
            +
             | 
| 27 | 
            +
            ## Supported Platforms
         | 
| 28 | 
            +
             | 
| 29 | 
            +
            - AX650
         | 
| 30 | 
            +
              - AX650N DEMO Board
         | 
| 31 | 
            +
             | 
| 32 | 
            +
             | 
| 33 | 
            +
            **Speech Generation**  
         | 
| 34 | 
            +
            | Stage | Time |
         | 
| 35 | 
            +
            |------|------|
         | 
| 36 | 
            +
            | llm prefill ( input_token_num + prompt_token_num in [0,128 ] ) | 104 ms  | 
         | 
| 37 | 
            +
            | llm prefill ( input_token_num + prompt_token_num in [128,256 ] ) | 234 ms  | 
         | 
| 38 | 
            +
            | Decode  |  21.24 token/s |
         | 
| 39 | 
            +
             | 
| 40 | 
            +
            ## How to use
         | 
| 41 | 
            +
             | 
| 42 | 
            +
            Download all files from this repository to the device  
         | 
| 43 | 
            +
             | 
| 44 | 
            +
            ### 1. Text to Speech (Voice Cloning)    
         | 
| 45 | 
            +
             | 
| 46 | 
            +
            #### 1. Install python library    
         | 
| 47 | 
            +
            Steps 2 and 3 require the use of these Python packages. If you run Steps 2 and 3 on a PC, install them on the PC.  
         | 
| 48 | 
            +
            ```
         | 
| 49 | 
            +
            pip3 install -r scripts/requirements.txt
         | 
| 50 | 
            +
            ```  
         | 
| 51 | 
            +
             | 
| 52 | 
            +
            #### 2. Process Prompt Speech    
         | 
| 53 | 
            +
            ```
         | 
| 54 | 
            +
            python scripts/process_prompt.py
         | 
| 55 | 
            +
            ```
         | 
| 56 | 
            +
             | 
| 57 | 
            +
            Pass parameters according to the actual situation.
         | 
| 58 | 
            +
            ```
         | 
| 59 | 
            +
            args.add_argument('--model_dir', type=str, default="../../model_convert/pretrained_models/CosyVoice2-0.5B/")
         | 
| 60 | 
            +
            args.add_argument('--wetext_dir', type=str, default="../../model_convert/pengzhendong/wetext/")
         | 
| 61 | 
            +
            args.add_argument('--sample_rate', type=int, default=24000)
         | 
| 62 | 
            +
            args.add_argument('--zero_shot_spk_id', type=str, default="")
         | 
| 63 | 
            +
            args.add_argument('--tts_text', type=str, default="君不见黄河之水天上来,奔流到海不复回。君不见高堂明镜悲白发,朝如青丝暮成雪。")
         | 
| 64 | 
            +
            args.add_argument('--prompt_text', type=str, default="希望你以后能够做的比我还好呦。")
         | 
| 65 | 
            +
            args.add_argument('--prompt_speech', type=str, default="../../model_convert/asset/zero_shot_prompt.wav")
         | 
| 66 | 
            +
            ```
         | 
| 67 | 
            +
             | 
| 68 | 
            +
            #### 3. Start HTTP Tokenizer Server  
         | 
| 69 | 
            +
            ```
         | 
| 70 | 
            +
            cd scripts
         | 
| 71 | 
            +
            python cosyvoice2_tokenizer.py --host {your host} --port {your port}   
         | 
| 72 | 
            +
            ```
         | 
| 73 | 
            +
             | 
| 74 | 
            +
            #### 4. Run on AX650 Board  
         | 
| 75 | 
            +
            1) Modify the HTTP host in `run.sh`.  
         | 
| 76 | 
            +
            2) Copy `scripts/run.sh`, `build/install/bin/main`, and the files generated by `process_prompt.py` to the AX650 Board  
         | 
| 77 | 
            +
            3) Run `run.sh`  
         | 
| 78 | 
            +
            ```shell
         | 
| 79 | 
            +
            root@ax650 ~/yongqiang/lhj/Cosyvoice2.Axera/cpp/src # bash run.sh 
         | 
| 80 | 
            +
            rm: cannot remove 'output*.wav': No such file or directory
         | 
| 81 | 
            +
            [I][                            Init][ 108]: LLM init start
         | 
| 82 | 
            +
            [I][                            Init][  34]: connect http://10.122.86.184:12345 ok
         | 
| 83 | 
            +
            bos_id: 0, eos_id: 1773
         | 
| 84 | 
            +
              7% | ███                               |   2 /  27 [3.11s<42.04s, 0.64 count/s] embed_selector init ok[I][                            Init][ 138]: attr.axmodel_num:24
         | 
| 85 | 
            +
            100% | ████████████████████████████████ |  27 /  27 [10.32s<10.32s, 2.62 count/s] init post axmodel ok,remain_cmm(7178 MB)
         | 
| 86 | 
            +
            [I][                            Init][ 216]: max_token_len : 1023
         | 
| 87 | 
            +
            [I][                            Init][ 221]: kv_cache_size : 128, kv_cache_num: 1023
         | 
| 88 | 
            +
            [I][                            Init][ 229]: prefill_token_num : 128
         | 
| 89 | 
            +
            [I][                            Init][ 233]: grp: 1, prefill_max_token_num : 1
         | 
| 90 | 
            +
            [I][                            Init][ 233]: grp: 2, prefill_max_token_num : 128
         | 
| 91 | 
            +
            [I][                            Init][ 233]: grp: 3, prefill_max_token_num : 256
         | 
| 92 | 
            +
            [I][                            Init][ 233]: grp: 4, prefill_max_token_num : 384
         | 
| 93 | 
            +
            [I][                            Init][ 233]: grp: 5, prefill_max_token_num : 512
         | 
| 94 | 
            +
            [I][                            Init][ 237]: prefill_max_token_num : 512
         | 
| 95 | 
            +
            [I][                            Init][ 249]: LLM init ok
         | 
| 96 | 
            +
            [I][                            Init][ 154]: Token2Wav init ok
         | 
| 97 | 
            +
            [I][                            main][ 273]: 
         | 
| 98 | 
            +
            [I][                             Run][ 388]: input token num : 142, prefill_split_num : 2
         | 
| 99 | 
            +
            [I][                             Run][ 422]: input_num_token:128
         | 
| 100 | 
            +
            [I][                             Run][ 422]: input_num_token:14
         | 
| 101 | 
            +
            [I][                             Run][ 607]: ttft: 236.90 ms
         | 
| 102 | 
            +
            [Main/Token2Wav Thread] Processing batch of 28 tokens...
         | 
| 103 | 
            +
            Successfully saved audio to output_0.wav (32-bit Float PCM).
         | 
| 104 | 
            +
            [Main/Token2Wav Thread] Processing batch of 53 tokens...
         | 
| 105 | 
            +
            Successfully saved audio to output_1.wav (32-bit Float PCM).
         | 
| 106 | 
            +
            [Main/Token2Wav Thread] Processing batch of 78 tokens...
         | 
| 107 | 
            +
            Successfully saved audio to output_2.wav (32-bit Float PCM).
         | 
| 108 | 
            +
            [Main/Token2Wav Thread] Processing batch of 78 tokens...
         | 
| 109 | 
            +
            Successfully saved audio to output_3.wav (32-bit Float PCM).
         | 
| 110 | 
            +
            [Main/Token2Wav Thread] Processing batch of 78 tokens...
         | 
| 111 | 
            +
            Successfully saved audio to output_4.wav (32-bit Float PCM).
         | 
| 112 | 
            +
            [Main/Token2Wav Thread] Processing batch of 78 tokens...
         | 
| 113 | 
            +
            Successfully saved audio to output_5.wav (32-bit Float PCM).
         | 
| 114 | 
            +
            [Main/Token2Wav Thread] Processing batch of 78 tokens...
         | 
| 115 | 
            +
            Successfully saved audio to output_6.wav (32-bit Float PCM).
         | 
| 116 | 
            +
            [Main/Token2Wav Thread] Processing batch of 78 tokens...
         | 
| 117 | 
            +
            Successfully saved audio to output_7.wav (32-bit Float PCM).
         | 
| 118 | 
            +
            [Main/Token2Wav Thread] Processing batch of 78 tokens...
         | 
| 119 | 
            +
            Successfully saved audio to output_8.wav (32-bit Float PCM).
         | 
| 120 | 
            +
            [Main/Token2Wav Thread] Processing batch of 78 tokens...
         | 
| 121 | 
            +
            Successfully saved audio to output_9.wav (32-bit Float PCM).
         | 
| 122 | 
            +
            [I][                             Run][ 723]: hit eos, llm finished
         | 
| 123 | 
            +
            [I][                             Run][ 753]: llm finished
         | 
| 124 | 
            +
            [Main/Token2Wav Thread] Buffer is empty and LLM finished. Exiting.
         | 
| 125 | 
            +
             | 
| 126 | 
            +
             | 
| 127 | 
            +
            [I][                             Run][ 758]: total decode tokens:271
         | 
| 128 | 
            +
            [N][                             Run][ 759]: hit eos,avg 21.47 token/s
         | 
| 129 | 
            +
             | 
| 130 | 
            +
            Successfully saved audio to output_10.wav (32-bit Float PCM).
         | 
| 131 | 
            +
            Successfully saved audio to output.wav (32-bit Float PCM).
         | 
| 132 | 
            +
             | 
| 133 | 
            +
            Voice generation pipeline completed.
         | 
| 134 | 
            +
            Type "q" to exit, Ctrl+c to stop current running
         | 
| 135 | 
            +
            text >> 
         | 
| 136 | 
            +
            ```
         | 
| 137 | 
            +
             | 
| 138 | 
            +
            Output Speech:
         | 
| 139 | 
            +
            [output.wav](asset/output.wav)
         | 
    	
        asset/cross_lingual_prompt.wav
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:353a7715c2e4811f4045658b29d1ce67ecad5120e09de10ce890f1763aab486c
         | 
| 3 | 
            +
            size 606404
         | 
    	
        asset/dingding.png
    ADDED
    
    |   | 
| Git LFS Details
 | 
    	
        asset/output.wav
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:5c7a4c3837145df17e851c177f849446036e6f541d78eb6e107ea6b9e7b07672
         | 
| 3 | 
            +
            size 1067564
         | 
    	
        asset/zero_shot_prompt.wav
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:bd199eb7109fd6ce9943cb297e3cf350c1073af014063dfadbdc100230526243
         | 
| 3 | 
            +
            size 111496
         | 
    	
        frontend-onnx/campplus.onnx
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:a6ac6a63997761ae2997373e2ee1c47040854b4b759ea41ec48e4e42df0f4d73
         | 
| 3 | 
            +
            size 28303423
         | 
    	
        frontend-onnx/speech_tokenizer_v2.onnx
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:d43342aa12163a80bf07bffb94c9de2e120a8df2f9917cd2f642e7f4219c6f71
         | 
| 3 | 
            +
            size 496082973
         | 
    	
        main_ax650
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:4338e14a404311a5533f2e27d17726a5142e9fe7b92016ea2e845dad9dfe8bd1
         | 
| 3 | 
            +
            size 6641680
         | 
    	
        run.sh
    ADDED
    
    | @@ -0,0 +1,20 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # Directories holding the LLM axmodels/embeddings and the token2wav axmodels.
            LLM_DIR=CosyVoice-BlankEN-Ax650-prefill_512/
            TOKEN2WAV_DIR=token2wav-axmodels/

            # Remove audio left over from a previous run. -f keeps this silent when no
            # output*.wav exists yet (plain `rm` printed "cannot remove" on a first run).
            rm -f output*.wav

            # --filename_tokenizer_model is the URL of the HTTP tokenizer server
            # (scripts/cosyvoice2_tokenizer.py); change it to your own host and port.
            ./main_ax650 \
            --template_filename_axmodel "${LLM_DIR}/qwen2_p128_l%d_together.axmodel" \
            --token2wav_axmodel_dir "$TOKEN2WAV_DIR" \
            --axmodel_num 24 \
            --bos 0 --eos 0 \
            --filename_tokenizer_model "http://10.122.86.184:12345" \
            --filename_post_axmodel "${LLM_DIR}/qwen2_post.axmodel" \
            --filename_decoder_axmodel "${LLM_DIR}/llm_decoder.axmodel" \
            --filename_tokens_embed "${LLM_DIR}/model.embed_tokens.weight.bfloat16.bin" \
            --filename_llm_embed "${LLM_DIR}/llm.llm_embedding.float16.bin" \
            --filename_speech_embed "${LLM_DIR}/llm.speech_embedding.float16.bin" \
            --continue 0 \
            --text "君不见黄河之水天上来,奔流到海不复回。君不见高堂明镜悲白发,朝如青丝暮成雪。"

            # Make the generated audio readable/writable by every user.
            chmod 777 output*.wav
         | 
    	
        scripts/CosyVoice-BlankEN/tokenizer_config.json
    ADDED
    
    | @@ -0,0 +1,40 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "add_prefix_space": false,
         | 
| 3 | 
            +
              "added_tokens_decoder": {
         | 
| 4 | 
            +
                "151643": {
         | 
| 5 | 
            +
                  "content": "<|endoftext|>",
         | 
| 6 | 
            +
                  "lstrip": false,
         | 
| 7 | 
            +
                  "normalized": false,
         | 
| 8 | 
            +
                  "rstrip": false,
         | 
| 9 | 
            +
                  "single_word": false,
         | 
| 10 | 
            +
                  "special": true
         | 
| 11 | 
            +
                },
         | 
| 12 | 
            +
                "151644": {
         | 
| 13 | 
            +
                  "content": "<|im_start|>",
         | 
| 14 | 
            +
                  "lstrip": false,
         | 
| 15 | 
            +
                  "normalized": false,
         | 
| 16 | 
            +
                  "rstrip": false,
         | 
| 17 | 
            +
                  "single_word": false,
         | 
| 18 | 
            +
                  "special": true
         | 
| 19 | 
            +
                },
         | 
| 20 | 
            +
                "151645": {
         | 
| 21 | 
            +
                  "content": "<|im_end|>",
         | 
| 22 | 
            +
                  "lstrip": false,
         | 
| 23 | 
            +
                  "normalized": false,
         | 
| 24 | 
            +
                  "rstrip": false,
         | 
| 25 | 
            +
                  "single_word": false,
         | 
| 26 | 
            +
                  "special": true
         | 
| 27 | 
            +
                }
         | 
| 28 | 
            +
              },
         | 
| 29 | 
            +
              "additional_special_tokens": ["<|im_start|>", "<|im_end|>"],
         | 
| 30 | 
            +
              "bos_token": null,
         | 
| 31 | 
            +
              "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
         | 
| 32 | 
            +
              "clean_up_tokenization_spaces": false,
         | 
| 33 | 
            +
              "eos_token": "<|im_end|>",
         | 
| 34 | 
            +
              "errors": "replace",
         | 
| 35 | 
            +
              "model_max_length": 32768,
         | 
| 36 | 
            +
              "pad_token": "<|endoftext|>",
         | 
| 37 | 
            +
              "split_special_tokens": false,
         | 
| 38 | 
            +
              "tokenizer_class": "Qwen2Tokenizer",
         | 
| 39 | 
            +
              "unk_token": null
         | 
| 40 | 
            +
            }
         | 
    	
        scripts/CosyVoice-BlankEN/vocab.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        scripts/audio.py
    ADDED
    
    | @@ -0,0 +1,83 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import numpy as np
         | 
| 2 | 
            +
            import torch
         | 
| 3 | 
            +
            import torch.utils.data
         | 
| 4 | 
            +
            from librosa.filters import mel as librosa_mel_fn
         | 
| 5 | 
            +
            from scipy.io.wavfile import read
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            # Scale constant equal to 2**15. Presumably the int16 full-scale value used to
            # normalise PCM samples; it is not referenced in the visible part of this
            # module, so confirm against callers.
            MAX_WAV_VALUE = 32768.0
         | 
| 8 | 
            +
             | 
| 9 | 
            +
             | 
| 10 | 
            +
            def load_wav(full_path):
                """Read a WAV file and return ``(data, sampling_rate)``.

                Delegates to ``scipy.io.wavfile.read`` and swaps the order of the pair
                it returns, so the samples come first and the rate second.
                """
                rate_and_samples = read(full_path)
                return rate_and_samples[1], rate_and_samples[0]
         | 
| 13 | 
            +
             | 
| 14 | 
            +
             | 
| 15 | 
            +
            def dynamic_range_compression(x, C=1, clip_val=1e-5):
                """Return ``log(max(x, clip_val) * C)`` computed with NumPy.

                Values below ``clip_val`` are raised to ``clip_val`` before scaling, so
                the logarithm never sees zero or a negative number.
                """
                floored = np.clip(x, a_min=clip_val, a_max=None)
                return np.log(floored * C)
         | 
| 17 | 
            +
             | 
| 18 | 
            +
             | 
| 19 | 
            +
            def dynamic_range_decompression(x, C=1):
                """Invert ``dynamic_range_compression``: return ``exp(x) / C`` (NumPy)."""
                expanded = np.exp(x)
                return expanded / C
         | 
| 21 | 
            +
             | 
| 22 | 
            +
             | 
| 23 | 
            +
            def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
                """Return ``log(max(x, clip_val) * C)`` for a torch tensor ``x``.

                Elements below ``clip_val`` are clamped up to it first, keeping the
                logarithm finite.
                """
                clamped = torch.clamp(x, min=clip_val)
                return torch.log(clamped * C)
         | 
| 25 | 
            +
             | 
| 26 | 
            +
             | 
| 27 | 
            +
            def dynamic_range_decompression_torch(x, C=1):
                """Invert ``dynamic_range_compression_torch``: return ``exp(x) / C``."""
                expanded = torch.exp(x)
                return expanded / C
         | 
| 29 | 
            +
             | 
| 30 | 
            +
             | 
| 31 | 
            +
            def spectral_normalize_torch(magnitudes):
                """Log-compress ``magnitudes`` using the default compression settings."""
                return dynamic_range_compression_torch(magnitudes)
         | 
| 34 | 
            +
             | 
| 35 | 
            +
             | 
| 36 | 
            +
            def spectral_de_normalize_torch(magnitudes):
                """Undo ``spectral_normalize_torch`` using the default decompression settings."""
                return dynamic_range_decompression_torch(magnitudes)
         | 
| 39 | 
            +
             | 
| 40 | 
            +
             | 
| 41 | 
            +
            # Module-level caches, filled lazily by mel_spectrogram().
            #   mel_basis:   (sampling_rate, n_fft, num_mels, fmin, fmax, device) -> mel filterbank tensor
            #   hann_window: (win_size, device) -> Hann window tensor
            mel_basis = {}
            hann_window = {}


            def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
                """Compute a log-compressed mel spectrogram of ``y``.

                The signal is reflect-padded by ``(n_fft - hop_size) / 2`` samples on each
                side, transformed with ``torch.stft`` using a Hann window, reduced to a
                magnitude spectrum, projected onto a librosa mel filterbank and finally
                passed through ``spectral_normalize_torch``.

                Args:
                    y: Waveform tensor. Assumed to be 2-D ``(batch, samples)`` because it is
                        unsqueezed to 3-D for the reflect padding; confirm against callers.
                        A message is printed when values fall outside ``[-1, 1]``.
                    n_fft: FFT size.
                    num_mels: Number of mel bands.
                    sampling_rate: Sample rate handed to the mel filterbank builder.
                    hop_size: STFT hop length.
                    win_size: STFT window length.
                    fmin: Lowest filterbank frequency.
                    fmax: Highest filterbank frequency.
                    center: Forwarded to ``torch.stft``.

                Returns:
                    The tensor produced by ``spectral_normalize_torch`` on the mel-projected
                    magnitudes.
                """
                if torch.min(y) < -1.0:
                    print("min value is ", torch.min(y))
                if torch.max(y) > 1.0:
                    print("max value is ", torch.max(y))

                device = y.device

                # The filterbank depends on every one of these parameters. Keying the cache
                # on fmax and device alone returned a stale basis as soon as a second
                # configuration was used in the same process.
                mel_key = (sampling_rate, n_fft, num_mels, fmin, fmax, str(device))
                if mel_key not in mel_basis:
                    mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
                    mel_basis[mel_key] = torch.from_numpy(mel).float().to(device)

                # The window is cached independently of the filterbank and keyed on its
                # length, so a different win_size never reuses a window of the wrong size.
                window_key = (win_size, str(device))
                if window_key not in hann_window:
                    hann_window[window_key] = torch.hann_window(win_size).to(device)

                pad = int((n_fft - hop_size) / 2)
                y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode="reflect")
                y = y.squeeze(1)

                spec = torch.view_as_real(
                    torch.stft(
                        y,
                        n_fft,
                        hop_length=hop_size,
                        win_length=win_size,
                        window=hann_window[window_key],
                        center=center,
                        pad_mode="reflect",
                        normalized=False,
                        onesided=True,
                        return_complex=True,
                    )
                )

                # Magnitude from the (real, imag) pair; the epsilon keeps sqrt away from 0.
                spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))

                spec = torch.matmul(mel_basis[mel_key], spec)
                spec = spectral_normalize_torch(spec)

                return spec
         | 
    	
        scripts/cosyvoice2_tokenizer.py
    ADDED
    
    | @@ -0,0 +1,124 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            from transformers import AutoTokenizer, PreTrainedTokenizerFast
         | 
| 2 | 
            +
            from http.server import HTTPServer, BaseHTTPRequestHandler
         | 
| 3 | 
            +
            import json
         | 
| 4 | 
            +
            import argparse
         | 
| 5 | 
            +
            from tokenizer.tokenizer import get_qwen_tokenizer
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            class Tokenizer_Http():
                """Wrapper around the CosyVoice Qwen tokenizer used by the HTTP handlers.

                Exposes ``encode``/``decode`` plus the ``bos_id``, ``eos_id`` and
                ``eos_token`` properties.
                """

                def __init__(self):
                    # Load the tokenizer from the local "CosyVoice-BlankEN" directory.
                    self.tokenizer = get_qwen_tokenizer("CosyVoice-BlankEN", True)

                def encode(self, prompt):
                    """Return the token ids for ``prompt``; all special tokens are allowed."""
                    token_ids = self.tokenizer.encode(prompt, allowed_special="all")
                    return token_ids

                def decode(self, token_ids):
                    """Return the text decoded from ``token_ids``."""
                    return self.tokenizer.decode(token_ids)

                @property
                def bos_id(self):
                    """Id of the begin-of-sequence token, or ``None`` when there is none.

                    The accompanying tokenizer_config.json sets ``bos_token`` to null, so
                    ``None`` is returned. The property must exist because the GET /bos_id
                    handler reads it; without it that request raised AttributeError.
                    """
                    return None

                @property
                def eos_id(self):
                    """Hard-coded end-of-sequence id reported to clients."""
                    # NOTE(review): 1773 is not derived from the wrapped tokenizer; confirm
                    # it matches the id the board-side runtime expects.
                    return 1773

                @property
                def eos_token(self):
                    """Hard-coded end-of-sequence token string."""
                    # NOTE(review): tokenizer_config.json declares "<|im_end|>" as eos_token;
                    # confirm whether "<|eot_id|>" is intentional.
                    return "<|eot_id|>"
         | 
| 36 | 
            +
             | 
| 37 | 
            +
             | 
| 38 | 
            +
            # Single tokenizer instance shared by every HTTP request handler.
            tokenizer = Tokenizer_Http()

            # Start-up smoke test: print the ids produced for a fixed sample string.
            _sample_ids = tokenizer.encode("hello world")
            print(_sample_ids)
         | 
| 42 | 
            +
             | 
| 43 | 
            +
             | 
| 44 | 
            +
            class Request(BaseHTTPRequestHandler):
                """HTTP handler exposing the module-level ``tokenizer`` as a small JSON API.

                GET  /bos_id                       -> {"bos_id": <int>}  (-1 when unavailable)
                GET  /eos_id                       -> {"eos_id": <int>}  (-1 when unavailable)
                POST /encode {"text": ...}         -> {"token_ids": [...]}
                POST /decode {"token_ids": [...]}  -> {"text": ...}

                Any other path is answered with status 200 and the literal body ``error``.
                A POST whose Content-Length header or JSON body is missing or malformed is
                answered with status 400 and the body ``error``.
                """
                timeout = 5
                server_version = 'Apache'

                def _reply(self, status, kind, msg):
                    """Print ``msg``, then send it as the response body with a ``type`` header."""
                    print(msg)
                    self.send_response(status)
                    self.send_header("type", kind)  # custom response header: "get" or "post"
                    self.end_headers()
                    self.wfile.write(str(msg).encode())  # the body is sent as bytes

                def do_GET(self):
                    """Answer the /bos_id and /eos_id queries."""
                    print(self.path)

                    if self.path == '/bos_id':
                        # getattr: a tokenizer without a bos_id attribute is reported as -1
                        # instead of raising AttributeError in the middle of the response.
                        bos_id = getattr(tokenizer, 'bos_id', None)
                        msg = json.dumps({'bos_id': -1 if bos_id is None else bos_id})
                    elif self.path == '/eos_id':
                        eos_id = getattr(tokenizer, 'eos_id', None)
                        msg = json.dumps({'eos_id': -1 if eos_id is None else eos_id})
                    else:
                        msg = 'error'

                    self._reply(200, "get", msg)

                def do_POST(self):
                    """Answer the /encode and /decode requests."""
                    # Validate the declared body length before reading from the socket.
                    try:
                        length = int(self.headers['content-length'])
                    except (TypeError, ValueError):
                        self._reply(400, "post", 'error')
                        return
                    if length < 0:
                        self._reply(400, "post", 'error')
                        return
                    raw = self.rfile.read(length)  # request body as bytes

                    if self.path == '/encode':
                        field = 'text'
                    elif self.path == '/decode':
                        field = 'token_ids'
                    else:
                        self._reply(200, "post", 'error')
                        return

                    # Parse before any status line is written, so a malformed body yields
                    # a 400 instead of a 200 followed by an empty response.
                    try:
                        value = json.loads(raw.decode())[field]
                    except (ValueError, KeyError, TypeError):
                        self._reply(400, "post", 'error')
                        return

                    if field == 'text':
                        token_ids = tokenizer.encode(value)
                        msg = json.dumps({'token_ids': -1 if token_ids is None else token_ids})
                    else:
                        text = tokenizer.decode(value)
                        msg = json.dumps({'text': "" if text is None else text})

                    self._reply(200, "post", msg)
         | 
| 112 | 
            +
             | 
| 113 | 
            +
             | 
| 114 | 
            +
            if __name__ == "__main__":
                # Command-line options: the interface and the port to bind to.
                parser = argparse.ArgumentParser()
                parser.add_argument('--host', type=str, default='localhost')
                parser.add_argument('--port', type=int, default=12345)
                options = parser.parse_args()

                # (host, port) pair; 'localhost' is equivalent to '127.0.0.1'.
                address = (options.host, options.port)
                print('http://%s:%s' % address)

                # Create the server around the Request handler class and start serving.
                httpd = HTTPServer(address, Request)
                httpd.serve_forever()
         | 
