LiteRT
File size: 5,056 Bytes
a5c5b03
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import numpy as np
import torch
import glob
import os
import tqdm
import librosa
import parselmouth
from utils.commons.pitch_utils import f0_to_coarse
from utils.commons.multiprocess_utils import multiprocess_run_tqdm
from utils.commons.os_utils import multiprocess_glob
from utils.audio.io import save_wav

from moviepy.editor import VideoFileClip
from utils.commons.hparams import hparams, set_hparams

def resample_wav(wav_name, out_name, sr=16000):
    wav_raw, sr = librosa.core.load(wav_name, sr=sr)
    save_wav(wav_raw, out_name, sr)
    
def split_wav(mp4_name, wav_name=None):
    if wav_name is None:
        wav_name = mp4_name.replace(".mp4", ".wav").replace("/video/", "/audio/")
    if os.path.exists(wav_name):
        return wav_name
    os.makedirs(os.path.dirname(wav_name), exist_ok=True)
    
    video = VideoFileClip(mp4_name,verbose=False)
    dur = video.duration
    audio = video.audio 
    assert audio is not None
    audio.write_audiofile(wav_name,fps=16000,verbose=False,logger=None)
    return wav_name

def librosa_pad_lr(x, fsize, fshift, pad_sides=1):
    '''compute right padding (final frame) or both sides padding (first and final frames)
    '''
    assert pad_sides in (1, 2)
    # return int(fsize // 2)
    pad = (x.shape[0] // fshift + 1) * fshift - x.shape[0]
    if pad_sides == 1:
        return 0, pad
    else:
        return pad // 2, pad // 2 + pad % 2

def extract_mel_from_fname(wav_path,
                      fft_size=512,
                      hop_size=320,
                      win_length=512,
                      window="hann",
                      num_mels=80,
                      fmin=80,
                      fmax=7600,
                      eps=1e-6,
                      sample_rate=16000,
                      min_level_db=-100):
    if isinstance(wav_path, str):
        wav, _ = librosa.core.load(wav_path, sr=sample_rate)
    else:
        wav = wav_path

    # get amplitude spectrogram
    x_stft = librosa.stft(wav, n_fft=fft_size, hop_length=hop_size,
                          win_length=win_length, window=window, center=False)
    spc = np.abs(x_stft)  # (n_bins, T)

    # get mel basis
    fmin = 0 if fmin == -1 else fmin
    fmax = sample_rate / 2 if fmax == -1 else fmax
    mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=fft_size, n_mels=num_mels, fmin=fmin, fmax=fmax)
    mel = mel_basis @ spc

    mel = np.log10(np.maximum(eps, mel))  # (n_mel_bins, T)
    mel = mel.T

    l_pad, r_pad = librosa_pad_lr(wav, fft_size, hop_size, 1)
    wav = np.pad(wav, (l_pad, r_pad), mode='constant', constant_values=0.0)

    return wav.T, mel

def extract_f0_from_wav_and_mel(wav, mel,
                        hop_size=320,
                        audio_sample_rate=16000,
                        ):
    time_step = hop_size / audio_sample_rate * 1000
    f0_min = 80
    f0_max = 750
    f0 = parselmouth.Sound(wav, audio_sample_rate).to_pitch_ac(
        time_step=time_step / 1000, voicing_threshold=0.6,
        pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']

    delta_l = len(mel) - len(f0)
    assert np.abs(delta_l) <= 8
    if delta_l > 0:
        f0 = np.concatenate([f0, [f0[-1]] * delta_l], 0)
    f0 = f0[:len(mel)]
    pitch_coarse = f0_to_coarse(f0)
    return f0, pitch_coarse


def extract_mel_f0_from_fname(wav_name=None, out_name=None):
    try:
        out_name = wav_name.replace(".wav", "_mel_f0.npy").replace("/audio/", "/mel_f0/")
        os.makedirs(os.path.dirname(out_name), exist_ok=True)

        wav, mel = extract_mel_from_fname(wav_name)
        f0, f0_coarse = extract_f0_from_wav_and_mel(wav, mel)
        out_dict = {
            "mel": mel, # [T, 80]
            "f0": f0,
        }
        np.save(out_name, out_dict)
    except Exception as e:
        print(e)

def extract_mel_f0_from_video_name(mp4_name, wav_name=None, out_name=None):
    if mp4_name.endswith(".mp4"):
        wav_name = split_wav(mp4_name, wav_name)
        if out_name is None:
            out_name = mp4_name.replace(".mp4", "_mel_f0.npy").replace("/video/", "/mel_f0/")
    elif mp4_name.endswith(".wav"):
        wav_name = mp4_name
        if out_name is None:
            out_name = mp4_name.replace(".wav", "_mel_f0.npy").replace("/audio/", "/mel_f0/")

    os.makedirs(os.path.dirname(out_name), exist_ok=True)

    wav, mel = extract_mel_from_fname(wav_name)

    f0, f0_coarse = extract_f0_from_wav_and_mel(wav, mel)
    out_dict = {
        "mel": mel, # [T, 80]
        "f0": f0,
    }
    np.save(out_name, out_dict)


if __name__ == '__main__':
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument('--video_id', type=str, default='May', help='')
    args = parser.parse_args()
    ### Process Single Long Audio for NeRF dataset
    person_id = args.video_id

    wav_16k_name = f"data/processed/videos/{person_id}/aud.wav"
    out_name = f"data/processed/videos/{person_id}/aud_mel_f0.npy"
    extract_mel_f0_from_video_name(wav_16k_name, out_name)
    print(f"Saved at {out_name}")