Commit d57f9a7
Parent(s): e059497

More duration normalization

Files changed:
- .gitignore +5 -2
- inference.py +25 -8
- run.ipynb
.gitignore
CHANGED
@@ -1,5 +1,8 @@
-__pycache__/inference.cpython-311.pyc
-__pycache__/models.cpython-311.pyc
 Modules/__pycache__/__init__.cpython-311.pyc
 Modules/__pycache__/hifigan.cpython-311.pyc
 Modules/__pycache__/utils.cpython-311.pyc
+Modules/__pycache__/__init__.cpython-311.pyc
+Modules/__pycache__/hifigan.cpython-311.pyc
+Modules/__pycache__/utils.cpython-311.pyc
+__pycache__/inference.cpython-311.pyc
+__pycache__/models.cpython-311.pyc
inference.py
CHANGED
@@ -149,6 +149,22 @@ class StyleTTS2(torch.nn.Module):
         def replacement(match):
             return next(replacement_iter)
         return replacement
+
+    def __replace_outliers_zscore(self, tensor, threshold=3.0, factor=0.95):
+        mean = tensor.mean()
+        std = tensor.std()
+        z = (tensor - mean) / std
+
+        # Identify outliers
+        outlier_mask = torch.abs(z) > threshold
+        # Compute replacement value, respecting sign
+        sign = torch.sign(tensor - mean)
+        replacement = mean + sign * (threshold * std * factor)
+
+        result = tensor.clone()
+        result[outlier_mask] = replacement[outlier_mask]
+
+        return result
 
     def __load_models(self, models_path):
         module_params = []
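The new __replace_outliers_zscore helper clamps values lying more than `threshold` standard deviations from the mean back to the threshold boundary (scaled by `factor`), preserving their sign relative to the mean. A minimal standalone sketch of the same logic (the module-level function name and toy tensor are illustrative, not part of the commit):

import torch

def replace_outliers_zscore(tensor, threshold=3.0, factor=0.95):
    # Same math as the new private method, lifted out of the class
    mean = tensor.mean()
    std = tensor.std()
    z = (tensor - mean) / std
    outlier_mask = torch.abs(z) > threshold
    sign = torch.sign(tensor - mean)
    replacement = mean + sign * (threshold * std * factor)
    result = tensor.clone()
    result[outlier_mask] = replacement[outlier_mask]
    return result

# One extreme duration among otherwise similar values
durations = torch.tensor([5.0, 6.0, 5.5, 40.0, 6.2])
print(replace_outliers_zscore(durations, threshold=1.5))
# The 40.0 entry is pulled back to roughly mean + 1.5*std*0.95 ≈ 34.4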
@@ -180,7 +196,7 @@ class StyleTTS2(torch.nn.Module):
         device = self.get_device.device
         denoise = min(denoise, 1)
         if split_dur != 0: split_dur = max(int(split_dur), 1)
-        max_samples = 24000*
+        max_samples = 24000*20 #max 20 seconds ref audio
         print("Computing the style for:", path)
 
         wave, sr = librosa.load(path, sr=24000)
@@ -248,11 +264,12 @@ class StyleTTS2(torch.nn.Module):
         duration = self.predictor.duration_proj(x) / speed
         duration = torch.sigmoid(duration).sum(axis=-1)
 
-        if prev_d_mean != 0:#Stabilize speaking speed
+        if prev_d_mean != 0:#Stabilize speaking speed between splits
             dur_stats = torch.empty(duration.shape).normal_(mean=prev_d_mean, std=duration.std()).to(device)
         else:
             dur_stats = torch.empty(duration.shape).normal_(mean=duration.mean(), std=duration.std()).to(device)
         duration = duration*(1-t) + dur_stats*t
+        duration[:,1:-2] = self.__replace_outliers_zscore(duration[:,1:-2]) #Normalize outlier
 
         pred_dur = torch.round(duration.squeeze()).clamp(min=1)
         pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
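Taken together, the stabilization path blends the predicted per-token durations with samples drawn from a normal distribution centered on the previous split's mean duration, then clamps per-token outliers (sparing index 0 and the last two tokens, presumably boundary tokens). A sketch of the blend in isolation, with toy values standing in for model outputs:

import torch

t = 0.2                       # smooth_value when stabilize=True (see generate below)
prev_d_mean = 6.0             # mean duration carried over from the previous split
duration = torch.tensor([[5.0, 9.0, 4.0, 30.0, 5.0]])  # toy per-token durations

dur_stats = torch.empty(duration.shape).normal_(mean=prev_d_mean, std=duration.std().item())
duration = duration * (1 - t) + dur_stats * t  # pull 20% of the way toward the previous stats
# duration[:, 1:-2] is then passed through __replace_outliers_zscore in the real code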
@@ -272,7 +289,7 @@ class StyleTTS2(torch.nn.Module):
         return out.squeeze().cpu().numpy(), duration.mean()
 
     def get_styles(self, speakers, denoise=0.3, avg_style=True):
-        if avg_style: split_dur =
+        if avg_style: split_dur = 2
         else: split_dur = 0
         styles = {}
         for id in speakers:
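When avg_style is set, get_styles now passes split_dur = 2 into the style computation (plausibly 2-second splits, given the sample-rate math above), averaging the style across splits of the reference audio; split_dur = 0 keeps the single-pass behaviour.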
@@ -285,9 +302,9 @@ class StyleTTS2(torch.nn.Module):
         }
         return styles
 
-    def generate(self, text, styles, stabilize=
-        if stabilize:
-        else:
+    def generate(self, text, styles, stabilize=True, n_merge=16, default_speaker= "[id_1]"):
+        if stabilize: smooth_value=0.2
+        else: smooth_value=0
 
         list_wav = []
         prev_d_mean = 0
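The reworked generate signature exposes the stabilization as a flag that simply selects t=0.2 for the duration blend above. A hypothetical call, following the speaker-id pattern suggested by default_speaker (the constructor arguments and the soundfile writer are assumptions, not shown in this diff):

import soundfile as sf  # assumption: any writer accepting 24 kHz float audio works

tts = StyleTTS2(...)  # constructor arguments not shown in this commit
styles = tts.get_styles(["id_1"], denoise=0.3, avg_style=True)
wav = tts.generate("Hello there.", styles, stabilize=True, default_speaker="[id_1]")
sf.write("out.wav", wav, 24000)  # inference.py loads and produces audio at 24 kHz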
@@ -342,10 +359,10 @@ class StyleTTS2(torch.nn.Module):
             phonem = espeak_phn(sentence, styles[speaker_id]['lang'])
             phonem = re.sub(lang_pattern, replacement_func, phonem)
 
-            wav, prev_d_mean = self.__inference(phonem, current_ref_s, speed=speed, prev_d_mean=prev_d_mean, t=
+            wav, prev_d_mean = self.__inference(phonem, current_ref_s, speed=speed, prev_d_mean=prev_d_mean, t=smooth_value)
             wav = wav[4000:-4000] #Remove weird pulse and silent tokens
             list_wav.append(wav)
 
         final_wav = np.concatenate(list_wav)
-        final_wav = np.concatenate([np.zeros([
+        final_wav = np.concatenate([np.zeros([4000]), final_wav, np.zeros([4000])], axis=0) # add padding
         return final_wav
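Each synthesized chunk is trimmed by 4000 samples (about 0.17 s at 24 kHz) on both ends to drop the pulse and silent-token artifacts, and the new final line restores the same amount as leading and trailing silence. A small sketch of the padding arithmetic with stand-in audio:

import numpy as np

pad = np.zeros([4000])                        # ~0.167 s of silence at 24 kHz
chunks = [np.zeros(24000), np.zeros(24000)]   # stand-ins for the per-sentence wavs
final_wav = np.concatenate([pad, np.concatenate(chunks), pad], axis=0)
assert final_wav.shape[0] == 2 * 24000 + 2 * 4000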
run.ipynb
CHANGED
The diff for this file is too large to render. See raw diff.