File size: 2,602 Bytes
99bbd30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import os
import torch
import torchaudio


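# Disabled alternative normalization, kept for reference: it removes the mean,
# scales by the peak of channel 0 and then halves the amplitude.
#def normalize_wav(waveform):
#    waveform = waveform - torch.mean(waveform)
#    waveform = waveform / (torch.max(torch.abs(waveform[0, :])) + 1e-8)
#    return waveform * 0.5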

def normalize_wav(waveform, waveform_ref):
    """Rescale *waveform* so its peak magnitude matches that of *waveform_ref*.

    The peak is the maximum absolute value taken over the whole tensor.

    Args:
        waveform: Tensor to rescale.
        waveform_ref: Tensor whose peak magnitude is the target level.

    Returns:
        ``waveform / peak(waveform) * peak(waveform_ref)``. An empty or
        all-zero ``waveform`` is returned unchanged, because it has no peak
        to scale from.
    """
    # torch.max() cannot reduce an empty tensor; there is nothing to scale.
    if waveform.numel() == 0:
        return waveform

    peak = torch.max(torch.abs(waveform))
    # A silent signal would give 0 / 0 = NaN for every sample; keep it silent.
    if peak == 0:
        return waveform

    ref_peak = torch.max(torch.abs(waveform_ref))
    return waveform / peak * ref_peak


# Read the test list into memory; each entry keeps its trailing newline.
with open(
    "/ailab-train/speech/zhanghaomin/codes3/F5-TTS-main/data/v2c_test.lst", "r"
) as list_file:
    lines = list(list_file)


# Folder holding the generated audio; files are looked up as
# "<video key with '/' replaced by '__'>.flac".
v2a_path = "/ailab-train/speech/zhanghaomin/codes3/MMAudio-main/output_v2c_neg/"
# Root folder that receives the exported wav files.
output_dir = "outputs_v2a/"


# exist_ok=True makes folder creation idempotent and avoids the
# check-then-create race of os.path.exists() followed by os.makedirs().
os.makedirs(output_dir, exist_ok=True)
for subdir in ("/ref/", "/gen/", "/tgt/"):
    os.makedirs(output_dir + subdir, exist_ok=True)


TARGET_SR = 24000  # every clip is resampled to, and saved at, this rate


def _load_mono_24k(path):
    """Load *path*, resample it to TARGET_SR if needed, and keep channel 0 only."""
    audio, sr = torchaudio.load(path)
    if sr != TARGET_SR:
        audio = torchaudio.functional.resample(audio, orig_freq=sr, new_freq=TARGET_SR)
    # Slice (rather than index) so the channel dimension is preserved for save().
    return audio[0:1, :]


# The un-normalized output folders do not depend on the loop variable, so
# create them once up front instead of re-checking on every iteration.
for nonorm_dir in ("/ref_nonorm/", "/gen_nonorm/", "/tgt_nonorm/"):
    os.makedirs(output_dir + nonorm_dir, exist_ok=True)


for idx, line in enumerate(lines):
    # Skip blank lines (e.g. a trailing empty line) instead of crashing on the
    # unpack below; idx still counts them, so the numbering of the remaining
    # entries is unchanged.
    if not line.strip():
        continue

    # Six tab-separated fields per entry; only the two wav paths and the
    # target video key are used here.
    wav_p, _video_p, _txt_p, wav, video, _txt = line.strip().split("\t")

    # Generated audio is stored flat, with "/" in the video key replaced by "__".
    v2a_audio = v2a_path + video.replace("/", "__") + ".flac"

    target = _load_mono_24k(wav)
    reference = _load_mono_24k(wav_p)
    generated = _load_mono_24k(v2a_audio)

    out_name = f"{idx:08d}.wav"

    # Main export: the generated clip is level-matched to the reference peak.
    torchaudio.save(output_dir + "/ref/" + out_name, reference, TARGET_SR)
    torchaudio.save(output_dir + "/gen/" + out_name, normalize_wav(generated, reference), TARGET_SR)
    torchaudio.save(output_dir + "/tgt/" + out_name, target, TARGET_SR)

    # Second export without any level matching on the generated clip.
    torchaudio.save(output_dir + "/ref_nonorm/" + out_name, reference, TARGET_SR)
    torchaudio.save(output_dir + "/gen_nonorm/" + out_name, generated, TARGET_SR)
    torchaudio.save(output_dir + "/tgt_nonorm/" + out_name, target, TARGET_SR)