Very bad result in Italian - a setup problem?

#1
by davide445 - opened

I am testing with the code below, and the attached results are impossible to use. I am running CUDA 12.1 on a GTX 1070, and wanted to ask whether this is the current state of Italian TTS, or whether there is something wrong in my code.

import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
import soundfile as sf

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-mini-multilingual-v1.1").to(device)
# The text prompt and the voice description use two different tokenizers:
# the model's own one for the prompt, the text encoder's one for the description.
tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-multilingual-v1.1")
description_tokenizer = AutoTokenizer.from_pretrained(model.config.text_encoder._name_or_path)

# NOTE(review): this prompt is a whole paragraph. According to the maintainer's
# reply in this thread the model was trained on 2-30 s utterances, so text of
# this length should be split into shorter chunks that are generated one by one.
prompt = "Cosa sappiamo dell’evoluzione di Homo Heidelbergensis in Europa? Quanto è cambiata la sua morfologia una volta arrivato nel nostro continente? Per rispondere a queste domande disponiamo di una ricca serie di fossili distribuiti in tutta l’Europa: 130 frammenti appartenenti a 7 o 8 individui nel sito di Tautavel nei Pirenei orientali, 3000 frammenti (tra cui dei crani completi) associati ad almeno 28 individui nel sito di Sima de los Huesos in Spagna, un cranio a Aroeira in Portogallo, un osso occipitale a Vértesszöllös in Ungheria, una calotta cranica a Ceprano, qualche dente a Visogliano e un femore a Venosa in Italia, una tibia a Boxgrove in Inghilterra, ed il primo fossile in assoluto di Homo Heidelbergensis: la mandibola di Mauer, in Germania."
description = "Julia's voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise."

# Keep the full tokenizer outputs (not just input_ids) so that the attention
# masks can be forwarded to generate() alongside the token ids.
description_inputs = description_tokenizer(description, return_tensors="pt").to(device)
prompt_inputs = tokenizer(prompt, return_tensors="pt").to(device)
input_ids = description_inputs.input_ids
prompt_input_ids = prompt_inputs.input_ids

generation = model.generate(
    input_ids=input_ids,
    attention_mask=description_inputs.attention_mask,
    prompt_input_ids=prompt_input_ids,
    prompt_attention_mask=prompt_inputs.attention_mask,
)
# Move the waveform to host memory and drop singleton dimensions before writing.
audio_arr = generation.cpu().numpy().squeeze()
sf.write("parler_tts_out.wav", audio_arr, model.config.sampling_rate)

Parler TTS org

Hey @davide445 ,
Thanks for your message. The model was trained to generate speech from 2 to 30 s long. In other words, it can only handle short prompts that correspond to a short audio snippet.

Given the length of your prompt, you will probably have to generate it in multiple passes, one short chunk at a time.

Additionally, you can get inspiration from here: https://huggingface.co/spaces/ai4bharat/indic-parler-tts/blob/main/app.py#L179

That example shows both a long-form generation algorithm and proper use of the attention masks (don't forget to pass prompt_attention_mask and attention_mask to the generate call).

Hope it helps

This code still generates a bad result:

import torch
import nltk
import numpy as np
import io
import argparse
from pydub import AudioSegment
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
from tqdm import tqdm
import sys

# Default text to synthesize when the user does not enter their own.
DEFAULT_TEXT = """
La neve, il freddo e il ghiaccio portano disagi in Lombardia. Un nuovo peggioramento delle condizioni meteorologiche è previsto per i prossimi giorni, annuncia il Centro funzionale monitoraggio rischi della Regione. La Protezione civile ha diramato un'allerta meteo gialla (rischio ordinary) a partire dalle 3 di questa notte e per l'intera giornata di domani.
"""

# Default natural-language description of the speaker and recording conditions.
DEFAULT_DESCRIPTION = """
Julia's voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise.
"""

class ProgressBar:
    """Console progress indicator for multi-chunk audio generation.

    Progress is shown as a whole-number percentage on a single stdout line
    that is rewritten in place with a carriage return.
    """

    def __init__(self, total_chunks):
        """Create a tracker for ``total_chunks`` units of work.

        Args:
            total_chunks: Number of chunks that are going to be processed.
        """
        self.total_chunks = total_chunks
        self.current_chunk = 0      # index of the chunk currently in progress
        self.current_progress = 0   # last whole percentage that was printed

    def update(self, chunk_progress):
        """Report progress within the current chunk.

        Args:
            chunk_progress: Completion of the current chunk, from 0 to 100.
        """
        if self.total_chunks <= 0:
            # Nothing to track; also avoids a division by zero below.
            return

        chunk_contribution = 100.0 / self.total_chunks
        total_progress = (self.current_chunk * chunk_contribution) + (chunk_progress * chunk_contribution / 100)
        # Never display more than 100%, whatever order the caller uses for
        # update() and next_chunk().
        total_progress = min(total_progress, 100.0)

        # Only redraw when the displayed integer percentage actually grows.
        if int(total_progress) > self.current_progress:
            self.current_progress = int(total_progress)
            sys.stdout.write(f'\rGenerazione audio: {self.current_progress}%')
            sys.stdout.flush()

    def next_chunk(self):
        """Mark the current chunk as finished and move on to the next one."""
        self.current_chunk += 1

    def finish(self):
        """Print the final 100% line and terminate it with a newline."""
        sys.stdout.write('\rGenerazione audio: 100%\n')
        sys.stdout.flush()

def numpy_to_mp3(audio_array, sampling_rate):
    """Wrap a mono waveform in a pydub ``AudioSegment``.

    Despite its name this function does not encode anything itself: it returns
    an ``AudioSegment`` that the caller can then export to MP3.

    Args:
        audio_array: NumPy array of samples. Floating-point input is
            peak-normalized and converted to 16-bit integers; any other dtype
            is passed through unchanged.
        sampling_rate: Sample rate of ``audio_array`` in Hz.

    Returns:
        A single-channel ``AudioSegment`` built from the raw sample bytes.
    """
    if np.issubdtype(audio_array.dtype, np.floating):
        # np.max raises on an empty array, so treat "no samples" as silence.
        max_val = np.max(np.abs(audio_array)) if audio_array.size else 0.0
        if max_val > 0:
            # Scale so that the loudest sample hits the int16 full-scale value.
            audio_array = (audio_array / max_val) * 32767
        # Always convert float input to int16, including all-zero (silent)
        # audio, so that sample_width below describes real PCM samples.
        audio_array = audio_array.astype(np.int16)

    audio_segment = AudioSegment(
        audio_array.tobytes(),
        frame_rate=sampling_rate,
        sample_width=audio_array.dtype.itemsize,
        channels=1,
    )

    return audio_segment

def _split_into_chunks(sentences, chunk_size):
    """Greedily pack consecutive sentences into chunks shorter than ``chunk_size`` words.

    A sentence that on its own reaches ``chunk_size`` words is not split; it
    simply becomes a chunk of its own.
    """
    chunks = []
    curr_sentence = ""

    for sentence in sentences:
        candidate = " ".join([curr_sentence, sentence]).strip()
        if len(candidate.split()) >= chunk_size:
            # Adding this sentence would make the chunk too long: close the
            # current chunk (if any) and start a new one with this sentence.
            if curr_sentence:
                chunks.append(curr_sentence)
            curr_sentence = sentence
        else:
            curr_sentence = candidate

    if curr_sentence:
        chunks.append(curr_sentence)

    return chunks


def generate_long_form_audio(text, description, model, tokenizer, description_tokenizer, device, chunk_size=25):
    """Synthesize ``text`` chunk by chunk and return the concatenated waveform.

    The text is split into sentences with NLTK, the sentences are grouped into
    chunks of fewer than ``chunk_size`` words, and every chunk is generated
    separately with the same voice ``description``.

    Args:
        text: Text to read aloud.
        description: Natural-language description of the target voice.
        model: Model whose ``generate`` method produces the audio.
        tokenizer: Tokenizer for the text chunks.
        description_tokenizer: Tokenizer for ``description``.
        device: Device the inputs are moved to (string or ``torch.device``).
        chunk_size: Word budget per chunk.

    Returns:
        A 1-D float32 NumPy array with the audio of all successful chunks.

    Raises:
        RuntimeError: If no chunk produced any audio.
    """
    inputs = description_tokenizer(description, return_tensors="pt").to(device)

    chunks = _split_into_chunks(nltk.sent_tokenize(text), chunk_size)

    print(f"\nProcessing {len(chunks)} chunks")
    progress_bar = ProgressBar(len(chunks))
    all_audio = []
    # str() lets callers pass either "cuda:0" or torch.device("cuda:0").
    use_autocast = str(device).startswith("cuda")

    for i, chunk in enumerate(chunks):
        # Best effort: a failing chunk is reported and skipped so that the
        # remaining chunks can still be generated.
        try:
            prompt = tokenizer(chunk, return_tensors="pt").to(device)

            with torch.cuda.amp.autocast(enabled=use_autocast):
                generation = model.generate(
                    input_ids=inputs.input_ids,
                    attention_mask=inputs.attention_mask,
                    prompt_input_ids=prompt.input_ids,
                    prompt_attention_mask=prompt.attention_mask,
                    do_sample=True,
                    return_dict_in_generate=True
                )

            if hasattr(generation, 'sequences') and hasattr(generation, 'audios_length'):
                # Trim the padded output to the real audio length.
                audio = generation.sequences[0, :generation.audios_length[0]]
                # reshape(-1) always yields a 1-D array, even for a single
                # sample, which np.concatenate below requires.
                audio_np = audio.to(torch.float32).cpu().numpy().reshape(-1)
                all_audio.append(audio_np)
            else:
                print(f"\nWarning: chunk {i+1} returned no audio and was skipped")

        except Exception as e:
            print(f"\nError processing chunk {i+1}: {e}")

        # Report the chunk as complete *before* advancing, otherwise the
        # percentage runs one chunk ahead of the real progress.
        progress_bar.update(100)
        progress_bar.next_chunk()

    progress_bar.finish()

    if not all_audio:
        raise RuntimeError("No audio was generated")

    return np.concatenate(all_audio)

def _ask_choice(prompt_text, valid_choices):
    """Keep prompting on stdin until the answer is one of ``valid_choices``."""
    while True:
        answer = input(prompt_text).strip()
        if answer in valid_choices:
            return answer
        print("Scelta non valida. Riprova.")


def get_user_input():
    """Interactively ask for the device, the text and the output file name.

    Returns:
        A tuple ``(use_cpu, text, output_file)`` where ``use_cpu`` is True when
        the CPU was selected, ``text`` is the text to synthesize and
        ``output_file`` is a file name ending in ``.mp3``.
    """
    print("\n=== Parler TTS Audio Generator ===")

    # Device selection: the GPU option is only offered when CUDA is available.
    print("\nDispositivi disponibili:")
    print("1. CPU")
    if torch.cuda.is_available():
        print("2. GPU (CUDA)")
        valid_choices = ['1', '2']
    else:
        valid_choices = ['1']
        print("GPU non disponibile su questo sistema")

    device_choice = _ask_choice(
        "\nSeleziona il dispositivo da utilizzare (inserisci il numero): ",
        valid_choices,
    )
    use_cpu = device_choice == '1'

    # Text selection.
    print("\nOpzioni per il testo:")
    print("1. Usa testo predefinito")
    print("2. Inserisci nuovo testo")

    text_choice = _ask_choice("\nSeleziona l'opzione per il testo (1 o 2): ", ['1', '2'])

    if text_choice == '1':
        text = DEFAULT_TEXT
        print("\nUso il testo predefinito:")
        print(text)
    else:
        print("\nInserisci il tuo testo (premi Invio due volte per terminare):")
        lines = []
        # Read lines until the first empty one.
        while True:
            line = input()
            if line == "":
                break
            lines.append(line)
        text = "\n".join(lines)

    # Output file name.
    output_file = input("\nInserisci il nome del file di output (default: output.mp3): ").strip()
    if not output_file:
        output_file = "output.mp3"
    # Case-insensitive check so that e.g. "OUT.MP3" is not turned into "OUT.MP3.mp3".
    if not output_file.lower().endswith('.mp3'):
        output_file += '.mp3'

    return use_cpu, text, output_file

def main():
    """Run the interactive text-to-speech workflow end to end.

    Asks the user for the settings, loads the model and tokenizers, generates
    the audio chunk by chunk and exports it as an MP3 file. Any error is
    printed and then re-raised.
    """
    try:
        # Collect the settings from the user.
        use_cpu, text, output_file = get_user_input()

        # Device and dtype selection: half precision on the GPU, full on CPU.
        if use_cpu:
            device = "cpu"
            torch_dtype = torch.float32
            print("\nUsando CPU come richiesto")
        else:
            device = "cuda:0"
            torch_dtype = torch.float16
            print(f"\nUsando CUDA con device: {torch.cuda.get_device_name(0)}")

        # Sentence-tokenizer data. Newer NLTK releases look for "punkt_tab"
        # instead of "punkt"; a resource unknown to the installed version is
        # skipped without raising, so requesting both is safe.
        nltk.download('punkt', quiet=True)
        nltk.download('punkt_tab', quiet=True)

        print("\nCaricamento modelli...")
        # .to(device) already places the model, so no device_map is passed.
        model = ParlerTTSForConditionalGeneration.from_pretrained(
            "parler-tts/parler-tts-mini-multilingual-v1.1",
            torch_dtype=torch_dtype,
        ).to(device)

        tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-multilingual-v1.1")
        description_tokenizer = AutoTokenizer.from_pretrained(model.config.text_encoder._name_or_path)
        print("Modelli caricati con successo")

        print("\nInizio generazione audio...")
        audio_arr = generate_long_form_audio(
            text,
            DEFAULT_DESCRIPTION,
            model,
            tokenizer,
            description_tokenizer,
            device
        )

        print(f"\nLunghezza audio generato: {round(len(audio_arr) / model.config.sampling_rate, 2)} secondi")
        audio_segment = numpy_to_mp3(audio_arr, model.config.sampling_rate)

        audio_segment.export(output_file, format="mp3", bitrate="320k")
        print(f"Audio salvato con successo in: {output_file}")

    except Exception as e:
        # Top-level boundary: show a readable message, then propagate the error.
        print(f"\nErrore durante l'esecuzione: {e}")
        raise

# Run the interactive generator only when executed as a script, not on import.
if __name__ == "__main__":
    main()

Sign up or log in to comment