Audionar long form

Browse files

Files changed (6) hide show

README.md +3 -3
Utils/text_utils.py +5 -17
audiobook.py +2 -1
correct_figure.py +0 -378
demo.py +4 -4
visualize_per_sentence.py +0 -251

README.md CHANGED Viewed

@@ -14,13 +14,13 @@ tags:
 - mimic3
 ---
-Audionar - StyleTTS2 of speakers pregenerated by another TTS
 [![Beta Text 2 Speech Tool](assets/shift_banner.png?raw=true)](https://shift-europe.eu/)
 ##
-# SHIFT TTS / AudioGen
 Phonetic variation of [SHIFT TTS](https://audeering.github.io/shift/) blend to [AudioGen soundscapes](https://huggingface.co/dkounadis/artificial-styletts2/discussions/3)
   - [Analysis of emotion of SHIFT TTS](https://huggingface.co/dkounadis/artificial-styletts2/discussions/2)
@@ -82,7 +82,7 @@ python tts.py --text assets/ocr.txt --image assets/ocr.jpg --soundscape "battle
 ## Landscape 2 Soundscapes
-The following needs `api.py` to be already running on a tmux session.
 ```python
 # TTS & soundscape - output .mp4 saved in ./out/

 - mimic3
 ---
+Audionar - Phonetic Variation of StyleTTS2 blend to AudioGen SoundScapes
 [![Beta Text 2 Speech Tool](assets/shift_banner.png?raw=true)](https://shift-europe.eu/)
 ##
+# SHIFT TTS / Audionar
 Phonetic variation of [SHIFT TTS](https://audeering.github.io/shift/) blend to [AudioGen soundscapes](https://huggingface.co/dkounadis/artificial-styletts2/discussions/3)
   - [Analysis of emotion of SHIFT TTS](https://huggingface.co/dkounadis/artificial-styletts2/discussions/2)
 ## Landscape 2 Soundscapes
+The following requires `api.py` to be already running in another terminal.
 ```python
 # TTS & soundscape - output .mp4 saved in ./out/

Utils/text_utils.py CHANGED Viewed

@@ -5,6 +5,10 @@ import textwrap
 from num2words import num2words
 # IPA Phonemizer: https://github.com/bootphon/phonemizer
 import nltk
 _pad = "$"
 _punctuation = ';:,.!?¡¿—…"«»“” '
 _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
@@ -35,23 +39,7 @@ class TextCleaner:
 def split_into_sentences(text, max_len=120):
     sentences = nltk.sent_tokenize(text)
-    limited_sentences = []
-    for sentence in sentences:
-        if len(sentence) <= max_len:
-            limited_sentences.append(sentence)
-        else:
-            # If a sentence is too long, try to split it more intelligently
-            current_chunk = ""
-            words = sentence.split()
-            for word in words:
-                if len(current_chunk) + len(word) + 1 <= max_len: # +1 for space
-                    current_chunk += (word + " ").strip()
-                else:
-                    limited_sentences.append(current_chunk.strip())
-                    current_chunk = (word + " ").strip()
-            if current_chunk: # Add any remaining part
-                limited_sentences.append(current_chunk.strip())
     return limited_sentences

 from num2words import num2words
 # IPA Phonemizer: https://github.com/bootphon/phonemizer
 import nltk
+#nltk.download('punkt', download_dir='./')
+#nltk.download('punkt_tab', download_dir='./')
+nltk.data.path.append('.')
 _pad = "$"
 _punctuation = ';:,.!?¡¿—…"«»“” '
 _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
 def split_into_sentences(text, max_len=120):
     sentences = nltk.sent_tokenize(text)
+    limited_sentences = [i for sent in sentences for i in textwrap.wrap(sent, width=max_len)]
     return limited_sentences

audiobook.py CHANGED Viewed

@@ -194,7 +194,8 @@ for vox in voices:
     # SILENT CLIP
     clip_silent = ImageClip(STATIC_FRAME).set_duration(5)  # as long as the audio - TTS first
-    clip_silent.write_videofile(SILENT_VIDEO, fps=24)

     # SILENT CLIP
     clip_silent = ImageClip(STATIC_FRAME).set_duration(5)  # as long as the audio - TTS first
+    clip_silent.fps = 24
+    clip_silent.write_videofile(SILENT_VIDEO)

correct_figure.py DELETED Viewed

@@ -1,378 +0,0 @@
-# Emotion & CER are evaluated per sentence (not with the audinterface sliding window)
-import os
-import audresample
-import torch
-import matplotlib.pyplot as plt
-import soundfile
-import json
-import audb
-from transformers import AutoModelForAudioClassification
-from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2PreTrainedModel
-import types
-from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
-import pandas as pd
-import json
-import numpy as np
-from pathlib import Path
-import transformers
-import torch
-import audmodel
-import audiofile
-import jiwer
-# https://arxiv.org/pdf/2407.12229
-#  https://arxiv.org/pdf/2312.05187
-# https://arxiv.org/abs/2407.05407
-# https://arxiv.org/pdf/2408.06577
-# https://arxiv.org/pdf/2309.07405
-import msinference
-import os
-from random import shuffle
-config = transformers.Wav2Vec2Config() #finetuning_task='spef2feat_reg')
-config.dev = torch.device('cuda:0')
-config.dev2 = torch.device('cuda:0')
-LABELS = ['arousal', 'dominance', 'valence',
-           'Angry',
-           'Sad',
-           'Happy',
-           'Surprise',
-            'Fear',
-            'Disgust',
-            'Contempt',
-            'Neutral'
-            ]
-config = transformers.Wav2Vec2Config() #finetuning_task='spef2feat_reg')
-config.dev = torch.device('cuda:0')
-config.dev2 = torch.device('cuda:0')
-    # https://arxiv.org/pdf/2407.12229
-    #  https://arxiv.org/pdf/2312.05187
-    # https://arxiv.org/abs/2407.05407
-    # https://arxiv.org/pdf/2408.06577
-    # https://arxiv.org/pdf/2309.07405
-def _infer(self, x):
-    '''x: (batch, audio-samples-16KHz)'''
-    x = (x + self.config.mean) / self.config.std  # plus
-    x = self.ssl_model(x, attention_mask=None).last_hidden_state
-    # pool
-    h = self.pool_model.sap_linear(x).tanh()
-    w = torch.matmul(h, self.pool_model.attention)
-    w = w.softmax(1)
-    mu = (x * w).sum(1)
-    x = torch.cat(
-        [
-            mu,
-            ((x * x * w).sum(1) - mu * mu).clamp(min=1e-7).sqrt()
-        ], 1)
-    return self.ser_model(x)
-teacher_cat = AutoModelForAudioClassification.from_pretrained(
-    '3loi/SER-Odyssey-Baseline-WavLM-Categorical-Attributes',
-    trust_remote_code=True  # fun definitions see 3loi/SER-.. repo
-).to(config.dev2).eval()
-teacher_cat.forward = types.MethodType(_infer, teacher_cat)
-# ===================[:]===================== Dawn
-def _prenorm(x, attention_mask=None):
-    '''mean/var'''
-    if attention_mask is not None:
-        N = attention_mask.sum(1, keepdim=True)  # here attn msk is unprocessed just the original input
-        x -= x.sum(1, keepdim=True) / N
-        var = (x * x).sum(1, keepdim=True) / N
-    else:
-        x -= x.mean(1, keepdim=True)  # mean is an onnx operator reducemean saves some ops compared to casting integer N to float and the div
-        var = (x * x).mean(1, keepdim=True)
-    return x / torch.sqrt(var + 1e-7)
-from torch import nn
-from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2PreTrainedModel, Wav2Vec2Model
-class RegressionHead(nn.Module):
-        r"""Classification head."""
-        def __init__(self, config):
-            super().__init__()
-            self.dense = nn.Linear(config.hidden_size, config.hidden_size)
-            self.dropout = nn.Dropout(config.final_dropout)
-            self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
-        def forward(self, features, **kwargs):
-            x = features
-            x = self.dropout(x)
-            x = self.dense(x)
-            x = torch.tanh(x)
-            x = self.dropout(x)
-            x = self.out_proj(x)
-            return x
-class Dawn(Wav2Vec2PreTrainedModel):
-    r"""Speech emotion classifier."""
-    def __init__(self, config):
-        super().__init__(config)
-        self.config = config
-        self.wav2vec2 = Wav2Vec2Model(config)
-        self.classifier = RegressionHead(config)
-        self.init_weights()
-    def forward(
-            self,
-            input_values,
-            attention_mask=None,
-    ):
-        x = _prenorm(input_values, attention_mask=attention_mask)
-        outputs = self.wav2vec2(x, attention_mask=attention_mask)
-        hidden_states = outputs[0]
-        hidden_states = torch.mean(hidden_states, dim=1)
-        logits = self.classifier(hidden_states)
-        return logits
-        # return {'hidden_states': hidden_states,
-        #         'logits': logits}
-dawn = Dawn.from_pretrained('audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim').to(config.dev).eval()
-# =======================================
-torch_dtype = torch.float16 #if torch.cuda.is_available() else torch.float32
-model_id = "openai/whisper-large-v3"
-model = AutoModelForSpeechSeq2Seq.from_pretrained(
-    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
-).to(config.dev)
-processor = AutoProcessor.from_pretrained(model_id)
-_pipe = pipeline(
-    "automatic-speech-recognition",
-    model=model,
-    tokenizer=processor.tokenizer,
-    feature_extractor=processor.feature_extractor,
-    max_new_tokens=128,
-    chunk_length_s=30,
-    batch_size=16,
-    return_timestamps=True,
-    torch_dtype=torch_dtype,
-    device=config.dev,
-)
-def process_function(x, sampling_rate, idx):
-    # x = x[None , :]  ASaHSuFDCN
-    #  {0: 'Angry', 1: 'Sad', 2: 'Happy', 3: 'Surprise',
-    #  4: 'Fear', 5: 'Disgust', 6: 'Contempt', 7: 'Neutral'}
-    #tensor([[0.0015, 0.3651, 0.0593, 0.0315, 0.0600, 0.0125, 0.0319, 0.4382]])
-    logits_cat = teacher_cat(torch.from_numpy(x).to(config.dev)).softmax(1)
-    logits_adv = dawn(torch.from_numpy(x).to(config.dev))
-    out = torch.cat([logits_adv,
-                            logits_cat],
-                            1).cpu().detach().numpy()
-    # print(out.shape)
-    return out[0, :]
-def load_speech(split=None):
-    DB = [
-        # [dataset, version, table, has_timedeltas_or_is_full_wavfile]
-          #  ['crema-d', '1.1.1', 'emotion.voice.test', False],
-        #['librispeech', '3.1.0', 'test-clean', False],
-            ['emodb',  '1.2.0', 'emotion.categories.train.gold_standard', False],
-  #          ['entertain-playtestcloud', '1.1.0', 'emotion.categories.train.gold_standard', True],
-   #         ['erik', '2.2.0', 'emotion.categories.train.gold_standard', True],
-    #        ['meld', '1.3.1', 'emotion.categories.train.gold_standard', False],
-            # ['msppodcast', '5.0.0', 'emotion.categories.train.gold_standard', False],  # standalone bucket because it has gt labels?
-     #       ['myai', '1.0.1', 'emotion.categories.train.gold_standard', False],
-      #      ['casia', None, 'emotion.categories.gold_standard', False],
-            # ['switchboard-1', None, 'sentiment', True],
-            # ['swiss-parliament', None, 'segments', True],
-            # ['argentinian-parliament', None, 'segments', True],
-            # ['austrian-parliament', None, 'segments', True],
-            # #'german', --> bundestag
-            # ['brazilian-parliament', None, 'segments', True],
-            # ['mexican-parliament', None, 'segments', True],
-            # ['portuguese-parliament', None, 'segments', True],
-       #     ['spanish-parliament', None, 'segments', True],
-        #    ['chinese-vocal-emotions-liu-pell', None, 'emotion.categories.desired', False],
-            # peoples-speech slow
-         #   ['peoples-speech', None, 'train-initial', False]
-    ]
-    output_list = []
-    for database_name, ver, table, has_timedeltas in DB:
-        a = audb.load(database_name,
-                        sampling_rate=16000,
-                        format='wav',
-                        mixdown=True,
-                        version=ver,
-                        cache_root='/cache/audb/')
-        a = a[table].get()
-        if has_timedeltas:
-            print(f'{has_timedeltas=}')
-            # a = a.reset_index()[['file', 'start', 'end']]
-            # output_list += [[*t] for t
-            #         in zip(a.file.values, a.start.dt.total_seconds().values, a.end.dt.total_seconds().values)]
-        else:
-            output_list += [f for f in a.index]  # use file (no timedeltas)
-    return output_list
-natural_wav_paths = load_speech()
-with open('harvard.json', 'r') as f:
-    harvard_individual_sentences = json.load(f)['sentences']
-synthetic_wav_paths = ['./enslow/' + i for i in
-                       os.listdir('./enslow/')]
-synthetic_wav_paths_4x = ['./style_vector_v2/' + i for i in
-                    os.listdir('./style_vector_v2/')]
-synthetic_wav_paths_foreign = ['./mimic3_foreign/' + i for i in os.listdir('./mimic3_foreign/') if 'en_U' not in i]
-synthetic_wav_paths_foreign_4x = ['./mimic3_foreign_4x/' + i for i in os.listdir('./mimic3_foreign_4x/') if 'en_U' not in i]  # very short segments
-# filter very short styles
-synthetic_wav_paths_foreign = [i for i in synthetic_wav_paths_foreign if audiofile.duration(i) > 2]
-synthetic_wav_paths_foreign_4x = [i for i in synthetic_wav_paths_foreign_4x if audiofile.duration(i) > 2]
-synthetic_wav_paths = [i for i in synthetic_wav_paths if audiofile.duration(i) > 2]
-synthetic_wav_pathsn_4x = [i for i in synthetic_wav_paths_4x if audiofile.duration(i) > 2]
-shuffle(synthetic_wav_paths_foreign_4x)
-shuffle(synthetic_wav_paths_foreign)
-shuffle(synthetic_wav_paths)
-shuffle(synthetic_wav_paths_4x)
-print(len(synthetic_wav_paths_foreign_4x), len(synthetic_wav_paths_foreign),
-      len(synthetic_wav_paths), len(synthetic_wav_paths_4x))  # 134 204 134 204
-for audio_prompt in ['english',
-                     'english_4x',
-                     'human',
-                     'foreign',
-                     'foreign_4x']:   # each of these creates a separate pkl - so outer for
-    #
-    data = np.zeros((770, len(LABELS)*2 + 2))  # 768 x LABELS-prompt & LABELS-stts2 & cer-prompt & cer-stts2
-    #
-    OUT_FILE = f'{audio_prompt}_analytic.pkl'
-    if not os.path.isfile(OUT_FILE):
-        ix = 0
-        for list_of_10 in harvard_individual_sentences[:10004]:
-                # long_sentence = ' '.join(list_of_10['sentences'])
-                # harvard.append(long_sentence.replace('.', ' '))
-                for text in list_of_10['sentences']:
-                    if audio_prompt == 'english':
-                        _p = synthetic_wav_paths[ix % len(synthetic_wav_paths)]
-                        #  134
-                        style_vec = msinference.compute_style(_p)
-                    elif audio_prompt == 'english_4x':
-                        _p = synthetic_wav_paths_4x[ix % len(synthetic_wav_paths_4x)]
-                        # 134]
-                        style_vec = msinference.compute_style(_p)
-                    elif audio_prompt == 'human':
-                        _p = natural_wav_paths[ix % len(natural_wav_paths)]
-                        # ?
-                        style_vec = msinference.compute_style(_p)
-                    elif audio_prompt == 'foreign':
-                        _p = synthetic_wav_paths_foreign[ix % len(synthetic_wav_paths_foreign)]
-                        # 204 some short styles are discarded ~ 1180
-                        style_vec = msinference.compute_style(_p)
-                    elif audio_prompt == 'foreign_4x':
-                        _p = synthetic_wav_paths_foreign_4x[ix % len(synthetic_wav_paths_foreign_4x)]
-                        # 174
-                        style_vec = msinference.compute_style(_p)
-                    else:
-                        print('unknonw list of style vector')
-                    x = msinference.inference(text,
-                                                style_vec,
-                                                alpha=0.3,
-                                                beta=0.7,
-                                                diffusion_steps=7,
-                                                embedding_scale=1)
-                    x = audresample.resample(x, 24000, 16000)
-                    _st, fsr = audiofile.read(_p)
-                    _st = audresample.resample(_st, fsr, 16000)
-                    print(_st.shape, x.shape)
-                    emotion_of_prompt = process_function(_st, 16000, None)
-                    emotion_of_out = process_function(x, 16000, None)
-                    data[ix, :11] = emotion_of_prompt
-                    data[ix, 11:22] = emotion_of_out
-                    # 2 last columns is cer-prompt cer-styletts2
-                    transcription_prompt = _pipe(_st[0])
-                    transcription_styletts2 = _pipe(x[0])  # allow singleton for EMO process func
-                    # print(len(emotion_of_prompt + emotion_of_out), ix, text)
-                    print(transcription_prompt, transcription_styletts2)
-                    data[ix, 22] = jiwer.cer('Sweet dreams are made of this. I travel the world and the seven seas.',
-                                       transcription_prompt['text'])
-                    data[ix, 23] = jiwer.cer(text,
-                                       transcription_styletts2['text'])
-                    print(data[ix, :])
-                    ix += 1
-        df = pd.DataFrame(data, columns=['prompt-' + i for i in LABELS] + ['styletts2-' + i for i in LABELS] + ['cer-prompt', 'cer-styletts2'])
-        df.to_pickle(OUT_FILE)
-    else:
-        df = pd.read_pickle(OUT_FILE)
-        print('\nALREADY EXISTS\n{df}')
-#  From the pickle we should also run cer and whisper on every prompt

demo.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import numpy as np
 import soundfile
-import msinference  # If using api.py/live_demo.py instead of this demo.py has also split into sentences for long form text OOM
-from audiocraft.builders import AudioGen  # has custom accelerations for long form text - needs 14 GB of cuda
 def tts_entry(text='A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.',
               voice='en_US/m-ailabs_low#mary_ann', # Listen to voices https://huggingface.co/dkounadis/artificial-styletts2/discussions/1
@@ -29,10 +29,10 @@ def tts_entry(text='A quick brown fox jumps over the lazy dog. Sweet dreams are
         x = msinference.foreign(text=text, lang=voice)
-    x /= 1.02 * np.abs(x).max() + 1e-7  # volume amplify full [-1,1]
     if soundscape is not None:
         sound_gen = AudioGen().to('cuda:0').eval()
-        background = sound_gen.generate(soundscape, duration=len(x)/16000 + .74,  # sound duration seconds
                                               ).detach().cpu().numpy()
         x = .6 * x + .4 * background[:len(x)]
     return x

 import numpy as np
 import soundfile
+import msinference  # Prefer live_demo.py instead as this demo.py has no split to sentences to prevent OOM
+from audiocraft.builders import AudioGen  # fixed bug for repeated calls
 def tts_entry(text='A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.',
               voice='en_US/m-ailabs_low#mary_ann', # Listen to voices https://huggingface.co/dkounadis/artificial-styletts2/discussions/1
         x = msinference.foreign(text=text, lang=voice)
+    x /= 1.02 * np.abs(x).max() + 1e-7  # volume amplify to [-1,1]
     if soundscape is not None:
         sound_gen = AudioGen().to('cuda:0').eval()
+        background = sound_gen.generate(soundscape, duration=len(x)/16000 + .74,  # sound duration in seconds
                                               ).detach().cpu().numpy()
         x = .6 * x + .4 * background[:len(x)]
     return x

visualize_per_sentence.py DELETED Viewed

@@ -1,251 +0,0 @@
-# PREREQUISITE
-# correct_figure.py -> produces the *_analytic.pkl files (emotion & CER), computed per sentence without the audinterface sliding window
-import pandas as pd
-import os
-import numpy as np
-from pathlib import Path
-import matplotlib.pyplot as plt
-import audiofile
-columns = ['prompt-arousal',
-           'prompt-dominance',
-           'prompt-valence',
-           'prompt-Angry',
-           'prompt-Sad',
-           'prompt-Happy',
-           'prompt-Surprise',
-           'prompt-Fear',
-           'prompt-Disgust',
-           'prompt-Contempt',
-           'prompt-Neutral',
-           'styletts2-arousal',
-           'styletts2-dominance',
-           'styletts2-valence',
-           'styletts2-Angry',
-           'styletts2-Sad',
-           'styletts2-Happy',
-           'styletts2-Surprise',
-           'styletts2-Fear',
-           'styletts2-Disgust',
-           'styletts2-Contempt',
-           'styletts2-Neutral',
-           'cer-prompt',
-           'cer-styletts2']
-FULL_PKL = ['english_4x_analytic.pkl',
-         'english_analytic.pkl',
-         'foreign_4x_analytic.pkl',
-         'foreign_analytic.pkl',
-         'human_analytic.pkl']
-# -------------------------------------------
-LABELS = ['arousal', 'dominance', 'valence',
-        #    'speech_synthesizer', 'synthetic_singing',
-           'Angry',
-           'Sad',
-           'Happy',
-           'Surprise',
-            'Fear',
-            'Disgust',
-            'Contempt',
-            'Neutral'
-            ]
-    # https://arxiv.org/pdf/2407.12229
-    #  https://arxiv.org/pdf/2312.05187
-    # https://arxiv.org/abs/2407.05407
-    # https://arxiv.org/pdf/2408.06577
-    # https://arxiv.org/pdf/2309.07405
-preds  = {}
-for file_interface in FULL_PKL:
-    y = pd.read_pickle(file_interface)
-    # y = y.rolling(20).mean()[19:]  --> avoid when printing character error rate
-    preds[file_interface] = y #.sort_values('styletts2-valence')
-    print(f'\n\n         {file_interface}\n_____________________________\n',
-          f"{y['cer-prompt'].mean()=}",
-          f"{y['cer-styletts2'].mean()=}\n\n")
-# =================================== cER ---------------------------
-for lang in ['english',
-             'foreign']:
-            fig, ax = plt.subplots(nrows=8, ncols=2, figsize=(24,20.7),
-                                   gridspec_kw={'hspace': 0, 'wspace': .04})
-            time_stamp = np.arange(len(preds['english_analytic.pkl']))
-            _z = np.zeros(len(preds['english_analytic.pkl']))
-            for j, dim in enumerate(['arousal', 'dominance', 'valence']):
-                # MIMIC3
-                ax[j, 0].plot(time_stamp, preds[f'{lang}_analytic.pkl'][f'styletts2-{dim}'],
-                            color=(0,104/255,139/255),
-                            label='mean_1',
-                            linewidth=2)
-                ax[j, 0].fill_between(time_stamp,
-                                _z,
-                                preds['human_analytic.pkl'][f'styletts2-{dim}'],
-                                color=(.2,.2,.2),
-                                alpha=0.244)
-                if j == 0:
-                    if lang == 'english':
-                        desc = 'English'
-                    else:
-                        desc = 'Non-English'
-                    ax[j, 0].legend([f'StyleTTS2 using Mimic-3 {desc}',
-                                     f'StyleTTS2 uising EmoDB'],
-                                    prop={'size': 14},
-                                    )
-                ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=17)
-                # TICK
-                ax[j, 0].set_ylim([1e-7, .9999])
-                # ax[j, 0].set_yticks([.25, .5,.75])
-                # ax[j, 0].set_yticklabels(['0.25', '.5', '0.75'])
-                ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
-                ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])
-            # MIMIC3   4x speed
-                ax[j, 1].plot(time_stamp, preds[f'{lang}_4x_analytic.pkl'][f'styletts2-{dim}'],
-                            color=(0,104/255,139/255),
-                            label='mean_1',
-                            linewidth=2)
-                ax[j, 1].fill_between(time_stamp,
-                                _z,
-                                preds['human_analytic.pkl'][f'styletts2-{dim}'],
-                                color=(.2,.2,.2),
-                                alpha=0.244)
-                if j == 0:
-                    if lang == 'english':
-                        desc = 'English'
-                    else:
-                        desc = 'Non-English'
-                    ax[j, 1].legend([f'StyleTTS2 using Mimic-3 {desc} 4x speed',
-                                    f'StyleTTS2 using EmoDB'],
-                                    prop={'size': 14},
-                                    #  loc='lower right'
-                                    )
-                ax[j, 1].set_xlabel('720 Harvard Sentences')
-                # TICK
-                ax[j, 1].set_ylim([1e-7, .9999])
-                # ax[j, 1].set_yticklabels(['' for _ in ax[j, 1].get_yticklabels()])
-                ax[j, 1].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
-                ax[j, 1].set_xlim([time_stamp[0], time_stamp[-1]])
-                ax[j, 0].grid()
-                ax[j, 1].grid()
-            # CATEGORIES
-            for j, dim in enumerate(['Angry',
-                                    'Sad',
-                                    'Happy',
-                                    #  'Surprise',
-                                    'Fear',
-                                    'Disgust',
-                                    #  'Contempt',
-                                    #  'Neutral'
-                                    ]):   # ASaHSuFDCN
-                j = j + 3  # skip A/D/V suplt
-                # MIMIC3
-                ax[j, 0].plot(time_stamp, preds[f'{lang}_analytic.pkl'][f'styletts2-{dim}'],
-                            color=(0,104/255,139/255),
-                            label='mean_1',
-                            linewidth=2)
-                ax[j, 0].fill_between(time_stamp,
-                                _z,
-                                preds['human_analytic.pkl'][f'styletts2-{dim}'],
-                                color=(.2,.2,.2),
-                                alpha=0.244)
-                # ax[j, 0].legend(['StyleTTS2 style mimic3',
-                #                  'StyleTTS2 style crema-d'],
-                #                  prop={'size': 10},
-                #                 #  loc='upper left'
-                # )
-                ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=17)
-                # TICKS
-                ax[j, 0].set_ylim([1e-7, .9999])
-                ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])
-                ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
-                ax[j, 0].set_xlabel('720 Harvard Sentences', fontsize=17, color=(.2,.2,.2))
-            # MIMIC3   4x speed
-                ax[j, 1].plot(time_stamp, preds[f'{lang}_4x_analytic.pkl'][f'styletts2-{dim}'],
-                            color=(0,104/255,139/255),
-                            label='mean_1',
-                            linewidth=2)
-                ax[j, 1].fill_between(time_stamp,
-                                _z,
-                                preds['human_analytic.pkl'][f'styletts2-{dim}'],
-                                color=(.2,.2,.2),
-                                alpha=0.244)
-                # ax[j, 1].legend(['StyleTTS2 style mimic3   4x speed',
-                #                  'StyleTTS2 style crema-d'],
-                #                  prop={'size': 10},
-                #                 #  loc='upper left'
-                # )
-                ax[j, 1].set_xlabel('720 Harvard Sentences', fontsize=17, color=(.2,.2,.2))
-                ax[j, 1].set_ylim([1e-7, .9999])
-                # ax[j, 1].set_yticklabels(['' for _ in ax[j, 1].get_yticklabels()])
-                ax[j, 1].set_xticklabels(['' for _ in ax[j, 1].get_xticklabels()])
-                ax[j, 1].set_xlim([time_stamp[0], time_stamp[-1]])
-                ax[j, 0].grid()
-                ax[j, 1].grid()
-            plt.savefig(f'persentence_{lang}.pdf', bbox_inches='tight')
-            plt.close()