Purupuru188 committed on
Commit c48be2a
1 Parent(s): 03d5607

Upload 4 files

fastfinetuning_kr/VC_inference.py ADDED
@@ -0,0 +1,147 @@
+ import os
+ import numpy as np
+ import torch
+ from torch import no_grad, LongTensor
+ import argparse
+ import commons
+ from mel_processing import spectrogram_torch
+ import utils
+ from models import SynthesizerTrn
+ import gradio as gr
+ import librosa
+ import webbrowser
+
+ from text import text_to_sequence, _clean_text
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ import logging
+ logging.getLogger("PIL").setLevel(logging.WARNING)
+ logging.getLogger("urllib3").setLevel(logging.WARNING)
+ logging.getLogger("markdown_it").setLevel(logging.WARNING)
+ logging.getLogger("httpx").setLevel(logging.WARNING)
+ logging.getLogger("asyncio").setLevel(logging.WARNING)
+
+ language_marks = {
+     "Japanese": "",
+     "日本語": "[JA]",
+     "简体中文": "[ZH]",
+     "English": "[EN]",
+     "한국어": "[KO]",
+     "Mix": "",
+ }
+ lang = ['日本語', '简体中文', 'English', 'Mix', '한국어']
+ def get_text(text, hps, is_symbol):
+     text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
+     if hps.data.add_blank:
+         text_norm = commons.intersperse(text_norm, 0)
+     text_norm = LongTensor(text_norm)
+     return text_norm
+
+ def create_tts_fn(model, hps, speaker_ids):
+     def tts_fn(text, speaker, language, speed):
+         if language is not None:
+             text = language_marks[language] + text + language_marks[language]
+         speaker_id = speaker_ids[speaker]
+         stn_tst = get_text(text, hps, False)
+         with no_grad():
+             x_tst = stn_tst.unsqueeze(0).to(device)
+             x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
+             sid = LongTensor([speaker_id]).to(device)
+             audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8,
+                                 length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
+         del stn_tst, x_tst, x_tst_lengths, sid
+         return "Success", (hps.data.sampling_rate, audio)
+
+     return tts_fn
+
+ def create_vc_fn(model, hps, speaker_ids):
+     def vc_fn(original_speaker, target_speaker, record_audio, upload_audio):
+         input_audio = record_audio if record_audio is not None else upload_audio
+         if input_audio is None:
+             return "You need to record or upload an audio", None
+         sampling_rate, audio = input_audio
+         original_speaker_id = speaker_ids[original_speaker]
+         target_speaker_id = speaker_ids[target_speaker]
+
+         audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
+         if len(audio.shape) > 1:
+             audio = librosa.to_mono(audio.transpose(1, 0))
+         if sampling_rate != hps.data.sampling_rate:
+             audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps.data.sampling_rate)
+         with no_grad():
+             y = torch.FloatTensor(audio)
+             # peak-normalize (note: the trailing / 0.99 pushes the peak slightly above 1.0)
+             y = y / max(-y.min(), y.max()) / 0.99
+             y = y.to(device)
+             y = y.unsqueeze(0)
+             spec = spectrogram_torch(y, hps.data.filter_length,
+                                      hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
+                                      center=False).to(device)
+             spec_lengths = LongTensor([spec.size(-1)]).to(device)
+             sid_src = LongTensor([original_speaker_id]).to(device)
+             sid_tgt = LongTensor([target_speaker_id]).to(device)
+             audio = model.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
+                 0, 0].data.cpu().float().numpy()
+         del y, spec, spec_lengths, sid_src, sid_tgt
+         return "Success", (hps.data.sampling_rate, audio)
+
+     return vc_fn
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--model_dir", default="./G_latest.pth", help="directory to your fine-tuned model")
+     parser.add_argument("--config_dir", default="./finetune_speaker.json", help="directory to your model config file")
+     parser.add_argument("--share", default=False, help="make link public (used in colab)")
+
+     args = parser.parse_args()
+     hps = utils.get_hparams_from_file(args.config_dir)
+
+     net_g = SynthesizerTrn(
+         len(hps.symbols),
+         hps.data.filter_length // 2 + 1,
+         hps.train.segment_size // hps.data.hop_length,
+         n_speakers=hps.data.n_speakers,
+         **hps.model).to(device)
+     _ = net_g.eval()
+
+     _ = utils.load_checkpoint(args.model_dir, net_g, None)
+     speaker_ids = hps.speakers
+     speakers = list(hps.speakers.keys())
+     tts_fn = create_tts_fn(net_g, hps, speaker_ids)
+     vc_fn = create_vc_fn(net_g, hps, speaker_ids)
+     app = gr.Blocks()
+     with app:
+         with gr.Tab("Text-to-Speech"):
+             with gr.Row():
+                 with gr.Column():
+                     textbox = gr.TextArea(label="Text",
+                                           placeholder="Type your sentence here",
+                                           value="こんにちわ。", elem_id=f"tts-input")
+                     # select character
+                     char_dropdown = gr.Dropdown(choices=speakers, value=speakers[0], label='character')
+                     language_dropdown = gr.Dropdown(choices=lang, value=lang[0], label='language')
+                     duration_slider = gr.Slider(minimum=0.1, maximum=5, value=1, step=0.1,
+                                                 label='速度 Speed')
+                 with gr.Column():
+                     text_output = gr.Textbox(label="Message")
+                     audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
+                     btn = gr.Button("Generate!")
+                     btn.click(tts_fn,
+                               inputs=[textbox, char_dropdown, language_dropdown, duration_slider],
+                               outputs=[text_output, audio_output])
+         with gr.Tab("Voice Conversion"):
+             gr.Markdown("""
+                 Record or upload audio, then select the target voice to convert it to.
+             """)
+             with gr.Column():
+                 record_audio = gr.Audio(label="record your voice", source="microphone")
+                 upload_audio = gr.Audio(label="or upload audio here", source="upload")
+                 source_speaker = gr.Dropdown(choices=speakers, value=speakers[0], label="source speaker")
+                 target_speaker = gr.Dropdown(choices=speakers, value=speakers[0], label="target speaker")
+             with gr.Column():
+                 message_box = gr.Textbox(label="Message")
+                 converted_audio = gr.Audio(label='converted audio')
+             btn = gr.Button("Convert!")
+             btn.click(vc_fn, inputs=[source_speaker, target_speaker, record_audio, upload_audio],
+                       outputs=[message_box, converted_audio])
+     webbrowser.open("http://127.0.0.1:7860")
+     app.launch(share=args.share)
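Note on vc_fn's input handling: Gradio hands the callback a (sample_rate, integer PCM array) tuple, and the function scales it to float32, down-mixes to mono, and resamples to the model's rate before computing the spectrogram. The sketch below exercises just that preprocessing in isolation; it assumes a 48 kHz stereo int16 recording and a 22050 Hz model sampling rate, and the helper name prepare_vc_input is hypothetical (not part of the repo).

# Minimal sketch of the vc_fn input preprocessing (hypothetical helper, illustrative only).
import numpy as np
import librosa

def prepare_vc_input(sampling_rate, audio, model_sr=22050):
    # Gradio delivers integer PCM; scale to [-1, 1] float32 as vc_fn does.
    audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    # Down-mix (samples, channels) recordings to mono.
    if audio.ndim > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    # Match the model's sampling rate from finetune_speaker.json.
    if sampling_rate != model_sr:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=model_sr)
    return audio

# Example: two seconds of silence recorded at 48 kHz, stereo int16.
dummy = np.zeros((96000, 2), dtype=np.int16)
print(prepare_vc_input(48000, dummy).shape)  # -> (44100,) at 22050 Hz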
fastfinetuning_kr/long_audio_transcribe.py ADDED
@@ -0,0 +1,82 @@
+ from moviepy.editor import AudioFileClip
+ import whisper
+ import os
+ import json
+ import torchaudio
+ import librosa
+ import torch
+ import argparse
+ parent_dir = "./denoised_audio/"
+ filelist = list(os.walk(parent_dir))[0][2]
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--languages", default="CJE")
+     parser.add_argument("--whisper_size", default="medium")
+     args = parser.parse_args()
+     if args.languages == "CJE":
+         lang2token = {
+             'zh': "[ZH]",
+             'ja': "[JA]",
+             "en": "[EN]",
+         }
+     elif args.languages == "CJ":
+         lang2token = {
+             'zh': "[ZH]",
+             'ja': "[JA]",
+         }
+     elif args.languages == "C":
+         lang2token = {
+             'zh': "[ZH]",
+         }
+     elif args.languages == "CJKE":
+         lang2token = {
+             'zh': "[ZH]",
+             'ja': "[JA]",
+             "en": "[EN]",
+             "ko": "[KO]",
+         }
+     assert (torch.cuda.is_available()), "Please enable GPU in order to run Whisper!"
+     with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
+         hps = json.load(f)
+     target_sr = hps['data']['sampling_rate']
+     model = whisper.load_model(args.whisper_size)
+     speaker_annos = []
+     for file in filelist:
+         print(f"transcribing {parent_dir + file}...\n")
+         options = dict(beam_size=5, best_of=5)
+         transcribe_options = dict(task="transcribe", **options)
+         result = model.transcribe(parent_dir + file, word_timestamps=True, **transcribe_options)
+         segments = result["segments"]
+         # result = model.transcribe(parent_dir + file)
+         lang = result['language']
+         if result['language'] not in list(lang2token.keys()):
+             print(f"{lang} not supported, ignoring...\n")
+             continue
+         # segment audio based on segment results
+         # os.path.splitext drops the extension; str.rstrip(".wav") would also strip trailing 'w'/'a'/'v' characters
+         file_stem = os.path.splitext(file)[0]
+         character_name = file_stem.split("_")[0]
+         code = file_stem.split("_")[1]
+         if not os.path.exists("./segmented_character_voice/" + character_name):
+             os.mkdir("./segmented_character_voice/" + character_name)
+         wav, sr = torchaudio.load(parent_dir + file, frame_offset=0, num_frames=-1, normalize=True,
+                                   channels_first=True)
+
+         for i, seg in enumerate(result['segments']):
+             start_time = seg['start']
+             end_time = seg['end']
+             text = seg['text']
+             text = lang2token[lang] + text.replace("\n", "") + lang2token[lang]
+             text = text + "\n"
+             wav_seg = wav[:, int(start_time*sr):int(end_time*sr)]
+             wav_seg_name = f"{character_name}_{code}_{i}.wav"
+             savepth = "./segmented_character_voice/" + character_name + "/" + wav_seg_name
+             speaker_annos.append(savepth + "|" + character_name + "|" + text)
+             print(f"Transcribed segment: {speaker_annos[-1]}")
+             # trimmed_wav_seg = librosa.effects.trim(wav_seg.squeeze().numpy())
+             # trimmed_wav_seg = torch.tensor(trimmed_wav_seg[0]).unsqueeze(0)
+             # the denoised audio is expected to already be at target_sr, so segments are saved at that rate
+             torchaudio.save(savepth, wav_seg, target_sr, channels_first=True)
+     if len(speaker_annos) == 0:
+         print("Warning: no long audios & videos found, this IS expected if you have only uploaded short audios")
+         print("this IS NOT expected if you have uploaded any long audios, videos or video links. Please check your file structure or make sure your audio/video language is supported.")
+     with open("./long_character_anno.txt", 'w', encoding='utf-8') as f:
+         for line in speaker_annos:
+             f.write(line)
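The segment loop above turns Whisper's per-segment timestamps (in seconds) into sample indices at the file's native sample rate via `int(start * sr):int(end * sr)`. Below is a small self-contained sketch of that same arithmetic; the segment boundaries and sample rate are made up for illustration rather than taken from a real Whisper result.

# Sketch of the timestamp-to-sample slicing used above (fabricated segment times).
import torch

sr = 22050                               # sample rate of the loaded wav
wav = torch.zeros(1, 10 * sr)            # stand-in for a 10-second mono recording
segments = [{"start": 0.5, "end": 2.3}, {"start": 2.3, "end": 5.0}]

for i, seg in enumerate(segments):
    # Same conversion as the script: seconds * samples-per-second -> sample index.
    wav_seg = wav[:, int(seg["start"] * sr):int(seg["end"] * sr)]
    print(i, wav_seg.shape[1] / sr, "seconds")   # ~1.8 and ~2.7 seconds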
fastfinetuning_kr/preprocess_v2.py ADDED
@@ -0,0 +1,156 @@
+ import os
+ import argparse
+ import json
+ import sys
+ sys.setrecursionlimit(500000)  # Raise the recursion limit to avoid "RecursionError: maximum recursion depth exceeded" in the text cleaners; adjust as needed.
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     # note: argparse's type=bool treats any non-empty string as True, so only pass this flag when auxiliary data is wanted
+     parser.add_argument("--add_auxiliary_data", type=bool, help="Whether to add extra data as fine-tuning helper")
+     parser.add_argument("--languages", default="CJE")
+     args = parser.parse_args()
+     if args.languages == "CJE":
+         langs = ["[ZH]", "[JA]", "[EN]"]
+     elif args.languages == "CJ":
+         langs = ["[ZH]", "[JA]"]
+     elif args.languages == "C":
+         langs = ["[ZH]"]
+     elif args.languages == "CJKE":
+         langs = ["[ZH]", "[JA]", "[EN]", "[KO]"]
+     new_annos = []
+     # Source 1: transcribed short audios
+     if os.path.exists("short_character_anno.txt"):
+         with open("short_character_anno.txt", 'r', encoding='utf-8') as f:
+             short_character_anno = f.readlines()
+             new_annos += short_character_anno
+     # Source 2: transcribed long audio segments
+     if os.path.exists("./long_character_anno.txt"):
+         with open("./long_character_anno.txt", 'r', encoding='utf-8') as f:
+             long_character_anno = f.readlines()
+             new_annos += long_character_anno
+
+     # Get all speaker names
+     speakers = []
+     for line in new_annos:
+         path, speaker, text = line.split("|")
+         if speaker not in speakers:
+             speakers.append(speaker)
+     assert (len(speakers) != 0), "No audio file found. Please check your uploaded file structure."
+     # Source 3 (Optional): sampled audios as extra training helpers
+     if args.add_auxiliary_data:
+         with open("./sampled_audio4ft.txt", 'r', encoding='utf-8') as f:
+             old_annos = f.readlines()
+         # filter old_annos according to supported languages
+         filtered_old_annos = []
+         for line in old_annos:
+             for lang in langs:
+                 if lang in line:
+                     filtered_old_annos.append(line)
+         old_annos = filtered_old_annos
+         for line in old_annos:
+             path, speaker, text = line.split("|")
+             if speaker not in speakers:
+                 speakers.append(speaker)
+         num_old_voices = len(old_annos)
+         num_new_voices = len(new_annos)
+         # STEP 1: balance number of new & old voices
+         cc_duplicate = num_old_voices // num_new_voices
+         if cc_duplicate == 0:
+             cc_duplicate = 1
+
+         # STEP 2: modify config file
+         with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
+             hps = json.load(f)
+
+         # assign ids to new speakers
+         speaker2id = {}
+         for i, speaker in enumerate(speakers):
+             speaker2id[speaker] = i
+         # modify n_speakers
+         hps['data']["n_speakers"] = len(speakers)
+         # overwrite speaker names
+         hps['speakers'] = speaker2id
+         hps['train']['log_interval'] = 10
+         hps['train']['eval_interval'] = 100
+         hps['train']['batch_size'] = 16
+         hps['data']['training_files'] = "final_annotation_train.txt"
+         hps['data']['validation_files'] = "final_annotation_val.txt"
+         # save modified config
+         with open("./configs/modified_finetune_speaker.json", 'w', encoding='utf-8') as f:
+             json.dump(hps, f, indent=2)
+
+         # STEP 3: clean annotations, replace speaker names with assigned speaker IDs
+         import text
+         cleaned_new_annos = []
+         for i, line in enumerate(new_annos):
+             path, speaker, txt = line.split("|")
+             if len(txt) > 150:
+                 continue
+             cleaned_text = text._clean_text(txt, hps['data']['text_cleaners'])
+             cleaned_text += "\n" if not cleaned_text.endswith("\n") else ""
+             cleaned_new_annos.append(path + "|" + str(speaker2id[speaker]) + "|" + cleaned_text)
+         cleaned_old_annos = []
+         for i, line in enumerate(old_annos):
+             path, speaker, txt = line.split("|")
+             if len(txt) > 150:
+                 continue
+             cleaned_text = text._clean_text(txt, hps['data']['text_cleaners'])
+             cleaned_text += "\n" if not cleaned_text.endswith("\n") else ""
+             cleaned_old_annos.append(path + "|" + str(speaker2id[speaker]) + "|" + cleaned_text)
+         # merge with old annotation
+         final_annos = cleaned_old_annos + cc_duplicate * cleaned_new_annos
+         # save annotation file
+         with open("./final_annotation_train.txt", 'w', encoding='utf-8') as f:
+             for line in final_annos:
+                 f.write(line)
+         # save annotation file for validation
+         with open("./final_annotation_val.txt", 'w', encoding='utf-8') as f:
+             for line in cleaned_new_annos:
+                 f.write(line)
+         print("finished")
+     else:
+         # Do not add extra helper data
+         # STEP 1: modify config file
+         with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
+             hps = json.load(f)
+
+         # assign ids to new speakers
+         speaker2id = {}
+         for i, speaker in enumerate(speakers):
+             speaker2id[speaker] = i
+         # modify n_speakers
+         hps['data']["n_speakers"] = len(speakers)
+         # overwrite speaker names
+         hps['speakers'] = speaker2id
+         hps['train']['log_interval'] = 10
+         hps['train']['eval_interval'] = 100
+         hps['train']['batch_size'] = 16
+         hps['data']['training_files'] = "final_annotation_train.txt"
+         hps['data']['validation_files'] = "final_annotation_val.txt"
+         # save modified config
+         with open("./configs/modified_finetune_speaker.json", 'w', encoding='utf-8') as f:
+             json.dump(hps, f, indent=2)
+
+         # STEP 2: clean annotations, replace speaker names with assigned speaker IDs
+         import text
+
+         cleaned_new_annos = []
+         for i, line in enumerate(new_annos):
+             path, speaker, txt = line.split("|")
+             if len(txt) > 150:
+                 continue
+             cleaned_text = text._clean_text(txt, hps['data']['text_cleaners']).replace("[ZH]", "")
+             cleaned_text += "\n" if not cleaned_text.endswith("\n") else ""
+             cleaned_new_annos.append(path + "|" + str(speaker2id[speaker]) + "|" + cleaned_text)
+
+         final_annos = cleaned_new_annos
+         # save annotation file
+         with open("./final_annotation_train.txt", 'w', encoding='utf-8') as f:
+             for line in final_annos:
+                 f.write(line)
+         # save annotation file for validation
+         with open("./final_annotation_val.txt", 'w', encoding='utf-8') as f:
+             for line in cleaned_new_annos:
+                 f.write(line)
+         print("finished")
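Each annotation line written here keeps the `path|speaker_id|cleaned_text` layout that final_annotation_train.txt and final_annotation_val.txt expect, and in the auxiliary-data branch the fresh data is repeated cc_duplicate times so it is not swamped by the sampled helper audio. A toy illustration of that balancing step follows; the counts and file names are invented, not taken from real data.

# Toy illustration of the new/old balancing in the auxiliary-data branch (invented numbers).
old_annos = [f"sampled_{i}.wav|7|[JA]text[JA]\n" for i in range(500)]            # helper data
new_annos = ["./segmented_character_voice/ABC/ABC_0_0.wav|0|[KO]text[KO]\n"] * 40  # fresh data

cc_duplicate = max(len(old_annos) // len(new_annos), 1)   # 500 // 40 = 12
final_annos = old_annos + cc_duplicate * new_annos
print(cc_duplicate, len(final_annos))                     # 12 and 980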
fastfinetuning_kr/short_audio_transcribe.py ADDED
@@ -0,0 +1,128 @@
+ import whisper
+ import os
+ import json
+ import torchaudio
+ import argparse
+ import torch
+
+ lang2token = {
+     'zh': "[ZH]",
+     'ja': "[JA]",
+     "en": "[EN]",
+ }
+ def transcribe_one(audio_path):
+     # load audio and pad/trim it to fit 30 seconds
+     audio = whisper.load_audio(audio_path)
+     audio = whisper.pad_or_trim(audio)
+
+     # make log-Mel spectrogram and move to the same device as the model
+     mel = whisper.log_mel_spectrogram(audio).to(model.device)
+
+     # detect the spoken language
+     _, probs = model.detect_language(mel)
+     print(f"Detected language: {max(probs, key=probs.get)}")
+     lang = max(probs, key=probs.get)
+     # decode the audio
+     options = whisper.DecodingOptions(beam_size=5)
+     result = whisper.decode(model, mel, options)
+
+     # print the recognized text
+     print(result.text)
+     return lang, result.text
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--languages", default="CJE")
+     parser.add_argument("--whisper_size", default="medium")
+     args = parser.parse_args()
+     if args.languages == "CJE":
+         lang2token = {
+             'zh': "[ZH]",
+             'ja': "[JA]",
+             "en": "[EN]",
+         }
+     elif args.languages == "CJ":
+         lang2token = {
+             'zh': "[ZH]",
+             'ja': "[JA]",
+         }
+     elif args.languages == "C":
+         lang2token = {
+             'zh': "[ZH]",
+         }
+     elif args.languages == "CJKE":
+         lang2token = {
+             'zh': "[ZH]",
+             'ja': "[JA]",
+             "en": "[EN]",
+             "ko": "[KO]",
+         }
+     assert (torch.cuda.is_available()), "Please enable GPU in order to run Whisper!"
+     model = whisper.load_model(args.whisper_size)
+     parent_dir = "./custom_character_voice/"
+     speaker_names = list(os.walk(parent_dir))[0][1]
+     speaker_annos = []
+     total_files = sum([len(files) for r, d, files in os.walk(parent_dir)])
+     # resample audios
+     # 2023/4/21: Get the target sampling rate
+     with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
+         hps = json.load(f)
+     target_sr = hps['data']['sampling_rate']
+     processed_files = 0
+     for speaker in speaker_names:
+         for i, wavfile in enumerate(list(os.walk(parent_dir + speaker))[0][2]):
+             # try to load file as audio
+             if wavfile.startswith("processed_"):
+                 continue
+             try:
+                 wav, sr = torchaudio.load(parent_dir + speaker + "/" + wavfile, frame_offset=0, num_frames=-1, normalize=True,
+                                           channels_first=True)
+                 wav = wav.mean(dim=0).unsqueeze(0)
+                 if sr != target_sr:
+                     wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(wav)
+                 if wav.shape[1] / sr > 20:
+                     # warns only; the clip is still processed below
+                     print(f"{wavfile} too long, ignoring\n")
+                 save_path = parent_dir + speaker + "/" + f"processed_{i}.wav"
+                 torchaudio.save(save_path, wav, target_sr, channels_first=True)
+                 # transcribe text
+                 lang, text = transcribe_one(save_path)
+                 if lang not in list(lang2token.keys()):
+                     print(f"{lang} not supported, ignoring\n")
+                     continue
+                 text = lang2token[lang] + text + lang2token[lang] + "\n"
+                 speaker_annos.append(save_path + "|" + speaker + "|" + text)
+
+                 processed_files += 1
+                 print(f"Processed: {processed_files}/{total_files}")
+             except Exception:
+                 # skip files that fail to load or transcribe
+                 continue
+
+     # # clean annotation
+     # import argparse
+     # import text
+     # from utils import load_filepaths_and_text
+     # for i, line in enumerate(speaker_annos):
+     #     path, sid, txt = line.split("|")
+     #     cleaned_text = text._clean_text(txt, ["cjke_cleaners2"])
+     #     cleaned_text += "\n" if not cleaned_text.endswith("\n") else ""
+     #     speaker_annos[i] = path + "|" + sid + "|" + cleaned_text
+     # write into annotation
+     if len(speaker_annos) == 0:
+         print("Warning: no short audios found, this IS expected if you have only uploaded long audios, videos or video links.")
+         print("this IS NOT expected if you have uploaded a zip file of short audios. Please check your file structure or make sure your audio language is supported.")
+     with open("short_character_anno.txt", 'w', encoding='utf-8') as f:
+         for line in speaker_annos:
+             f.write(line)
+
+     # import json
+     # # generate new config
+     # with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
+     #     hps = json.load(f)
+     # # modify n_speakers
+     # hps['data']["n_speakers"] = 1000 + len(speaker2id)
+     # # add speaker names
+     # for speaker in speaker_names:
+     #     hps['speakers'][speaker] = speaker2id[speaker]
+     # # save modified config
+     # with open("./configs/modified_finetune_speaker.json", 'w', encoding='utf-8') as f:
+     #     json.dump(hps, f, indent=2)
+     # print("finished")
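short_audio_transcribe.py discovers speakers purely from the directory layout: each sub-folder of ./custom_character_voice/ is treated as one speaker, and every audio file inside it as a clip of that speaker. The sketch below shows only that os.walk-based discovery with hypothetical speaker and clip names; the placeholder files it creates are empty, so no audio is actually loaded.

# Sketch of the folder layout the script walks (hypothetical speaker/clip names).
import os

layout = {
    "custom_character_voice/YourSpeaker": ["clip_1.wav", "clip_2.wav"],
    "custom_character_voice/AnotherSpeaker": ["hello.wav"],
}
for folder, clips in layout.items():
    os.makedirs(folder, exist_ok=True)
    for clip in clips:
        open(os.path.join(folder, clip), "wb").close()   # empty placeholder files

parent_dir = "./custom_character_voice/"
print(list(os.walk(parent_dir))[0][1])                            # speaker folders, as in speaker_names
print(sum(len(files) for _, _, files in os.walk(parent_dir)))     # 3 clips, as in total_files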