Mejiro J committed on
Commit
6992dff
·
1 Parent(s): daa8f0b

fuck you guys for making audo so difficult

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Reference_Voice/HonkaiSR/Kafka/audio.mp3 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ venv
Reference_Voice/HonkaiSR/Kafka/audio.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48f2e0d62986fb70a8f5b70e492239bb130f7ab9538ad699e9619198bfca235b
3
+ size 267329
Reference_Voice/text.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "HonkaiSR": {
3
+ "Kafka" : "我尤其钟爱丝绒质地的大衣,脆弱而美丽,很难保养,稍有不慎便有损它的光泽。"
4
+ }
5
+ }
app.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import spaces
3
+ import gradio as gr
4
+ import torch
5
+ import soundfile as sf
6
+ from transformers import AutoTokenizer, AutoModelForCausalLM
7
+ from xcodec2.modeling_xcodec2 import XCodec2Model
8
+ import tempfile
9
+ import json
10
+
11
# Run on the GPU when one is visible to torch, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

####################
# Load the models once, at import time
####################
# NOTE: despite the variable name, the checkpoint referenced here is the 1B
# multi-speaker model; the name is kept so existing references keep working.
llasa_3b = "HKUSTAudio/Llasa-1B-multi-speakers-genshin-zh-en-ja-ko"
print("Loading tokenizer & model ...")
tokenizer = AutoTokenizer.from_pretrained(llasa_3b)
model = AutoModelForCausalLM.from_pretrained(llasa_3b)
model.eval().to(device)

print("Loading XCodec2Model ...")
codec_model_path = "HKUSTAudio/xcodec2"
Codec_model = XCodec2Model.from_pretrained(codec_model_path)
Codec_model.eval().to(device)

print("Models loaded.")

# Transcripts of the reference clips, keyed as prompt_text_dict[game][speaker].
# Opened with a context manager so the file handle is closed deterministically.
with open("Reference_Voice/text.json", "r", encoding="utf-8") as prompt_text_file:
    prompt_text_dict = json.load(prompt_text_file)
30
+
31
+ ####################
32
+ # 推理用函数
33
+ ####################
34
def extract_speech_ids(speech_tokens_str):
    """
    Convert speech token strings such as ``<|s_23456|>`` back to ints (23456).

    Args:
        speech_tokens_str: Iterable of token strings.

    Returns:
        A list with the integer id of every well-formed speech token, in
        input order. Anything else (wrong prefix/suffix, or a payload that is
        not an integer) is reported on stdout and skipped.
    """
    prefix, suffix = "<|s_", "|>"
    speech_ids = []
    for token_str in speech_tokens_str:
        num = None
        if token_str.startswith(prefix) and token_str.endswith(suffix):
            try:
                num = int(token_str[len(prefix):-len(suffix)])
            except ValueError:
                # Right wrapper but non-numeric payload (e.g. "<|s_|>"):
                # report it below instead of aborting the whole conversion.
                num = None
        if num is None:
            print(f"Unexpected token: {token_str}")
        else:
            speech_ids.append(num)
    return speech_ids
47
+
48
def ids_to_speech_tokens(speech_ids):
    """
    Render each speech id as its token string, e.g. 12345 -> ``<|s_12345|>``.

    Returns a new list with one token string per element of ``speech_ids``,
    in the same order.
    """
    return [f"<|s_{code}|>" for code in speech_ids]
54
+
55
@spaces.GPU
def text2speech(target_text, game, speaker):
    """
    Synthesize ``target_text`` using a reference voice as the prompt.

    The clip ``Reference_Voice/{game}/{speaker}/audio.mp3`` is encoded into
    codec tokens and, together with its transcript from ``prompt_text_dict``,
    is placed in front of the target text so the language model continues the
    speech token sequence.

    Args:
        target_text: Text to be spoken.
        game: First-level key of ``prompt_text_dict`` and reference folder name.
        speaker: Second-level key of ``prompt_text_dict`` and speaker folder name.

    Returns:
        Path of a temporary ``.wav`` file written at 16 kHz. The file is not
        deleted automatically. Because decoding starts at the prompt's speech
        tokens, the audio contains the reference prompt followed by the newly
        generated speech.

    Raises:
        gr.Error: If the speaker has no transcript or reference clip, or if
            the model produced no usable speech tokens.
    """
    sample_rate = 16000

    try:
        prompt_text = prompt_text_dict[game][speaker]
    except KeyError as err:
        raise gr.Error(f"No reference transcript for {game}/{speaker}.") from err

    prompt_path = f"Reference_Voice/{game}/{speaker}/audio.mp3"
    try:
        prompt_wav, sr = sf.read(prompt_path)
    except (OSError, RuntimeError) as err:
        raise gr.Error(f"Could not read reference audio {prompt_path}.") from err

    # sf.read returns (frames, channels) for multi-channel files; mix down to mono.
    if prompt_wav.ndim > 1:
        prompt_wav = prompt_wav.mean(axis=1)
    if sr != sample_rate:
        # NOTE(review): the codec is documented upstream as 16 kHz only; the clip
        # is not resampled here, so confirm the reference files are 16 kHz.
        print(f"Warning: reference audio is {sr} Hz, expected {sample_rate} Hz")

    # The codec is fed a float tensor with a leading batch dimension, following
    # the upstream Llasa inference example, not the raw NumPy array.
    prompt_wav = torch.from_numpy(prompt_wav).float().unsqueeze(0)

    input_text = prompt_text + " " + target_text

    with torch.no_grad():
        # Encode the prompt wav
        vq_code_prompt = Codec_model.encode_code(input_waveform=prompt_wav)
        print("Prompt Vq Code Shape:", vq_code_prompt.shape )

        vq_code_prompt = vq_code_prompt[0, 0, :]
        # Convert int 12345 to token <|s_12345|>; tolist() yields plain ints
        speech_ids_prefix = ids_to_speech_tokens(vq_code_prompt.tolist())

        # Wrap the input text in the text-understanding marker tokens
        formatted_text = f"<|TEXT_UNDERSTANDING_START|>{input_text}<|TEXT_UNDERSTANDING_END|>"

        # Tokenize the text and the speech prefix
        chat = [
            {"role": "user", "content": "Convert the text to speech:" + formatted_text},
            {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>" + ''.join(speech_ids_prefix)}
        ]

        input_ids = tokenizer.apply_chat_template(
            chat,
            tokenize=True,
            return_tensors='pt',
            continue_final_message=True
        )
        # Use the same device the models were moved to instead of hard-coding CUDA.
        input_ids = input_ids.to(device)
        speech_end_id = tokenizer.convert_tokens_to_ids('<|SPEECH_GENERATION_END|>')

        # Generate the speech autoregressively
        outputs = model.generate(
            input_ids,
            max_length=2048,  # We trained our model with a max length of 2048
            eos_token_id=speech_end_id,
            do_sample=True,
            top_p=1,
            temperature=0.8,
        )

        # Keep the prompt's speech tokens plus everything generated after them.
        generated_ids = outputs[0][input_ids.shape[1] - len(speech_ids_prefix):]
        # Drop the end marker only when it is really there; if generation stopped
        # at max_length the last token is ordinary speech and must be kept.
        if generated_ids.numel() > 0 and generated_ids[-1].item() == speech_end_id:
            generated_ids = generated_ids[:-1]

        speech_tokens = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        # Convert token <|s_23456|> to int 23456
        speech_tokens = extract_speech_ids(speech_tokens)
        if not speech_tokens:
            raise gr.Error("The model did not produce any speech tokens.")

        speech_tokens = torch.tensor(speech_tokens).to(device).unsqueeze(0).unsqueeze(0)

        # Decode the speech tokens to speech waveform
        gen_wav = Codec_model.decode_code(speech_tokens)

        # Pull the waveform samples back to the CPU as a NumPy array
        audio = gen_wav[0, 0, :].cpu().numpy()

    # Reserve a temp file name, then write once the handle is closed so the
    # path is not opened twice at the same time.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
        audio_path = tmpfile.name
    sf.write(audio_path, audio, sample_rate)

    return audio_path
129
+
130
+ ####################
131
+ # Gradio 界面
132
+ ####################
133
# Speakers offered in the UI, grouped by the game they belong to.
speaker_game_dict = dict(
    HonkaiSR=["Kafka", "Firefly", "Silverwolf"],
    Zenless=["Yixuan", "Miyabi", "Jane"],
    Genshin=["Mavuika", "Navia", "Kokomi", "Furina", "Yoimiya"],
)

# Games shown in the first dropdown, in the insertion order of the mapping above.
game_choices = list(speaker_game_dict)
150
+ #["puck", "kore"]
151
+
152
if __name__ == "__main__":
    # Build the Gradio UI: pick a game and speaker, enter text, get audio back.
    with gr.Blocks() as demo:
        gr.Markdown("## Text to Speech Generation")
        with gr.Row():
            game = gr.Dropdown(label="Game", choices=game_choices, value="HonkaiSR")
            speaker = gr.Dropdown(label="Speaker", choices=speaker_game_dict[game.value], value="Kafka")

        target_text = gr.Textbox(label="Target Text", placeholder="Enter the text you want to convert to speech.")
        output_audio = gr.Audio(label="Generated Audio", type="filepath")

        def update_speakers(game):
            """Refresh the speaker dropdown when a different game is selected."""
            speakers = speaker_game_dict[game]
            # Returning a bare list would be taken as the dropdown's *value*;
            # an update object replaces the choices and selects a valid default.
            return gr.update(choices=speakers, value=speakers[0])

        game.change(update_speakers, inputs=game, outputs=speaker)

        text2speech_button = gr.Button("Generate Speech")
        text2speech_button.click(text2speech, inputs=[target_text, game, speaker], outputs=output_audio)

    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ xcodec2==0.1.3
2
+ soundfile