# Nidum-Madurai-Tamil-TTS

## 🔊 Tamil Text-to-Speech (TTS) Model by Nidum

🧪 **Live Demo:** Try it on Hugging Face Spaces
## 🗣️ Overview
This is a high-quality Tamil Text-to-Speech (TTS) model developed by Nidum. It generates clear, natural-sounding Tamil speech from input text and is suitable for voice assistants, screen readers, language learning apps, and content narration.
## 🚀 Features
- ✅ Converts Tamil script text to speech
- ✅ Natural and expressive voice
- ✅ Option to choose male or female voice
- ✅ Easy-to-use demo via Hugging Face Spaces
## 🧪 Live Demo

Type Tamil text, select a speaker, click Generate, and listen instantly!
## 🧑‍🎤 Speakers

| Speaker ID | Voice  |
|------------|--------|
| 0 speaker  | Male   |
| 1 speaker  | Female |
Use the appropriate speaker ID in your prompt, like this:

```
0 speaker: வணக்கம்!
```
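If you are building prompts programmatically, the format is simply the speaker ID followed by ` speaker: ` and the Tamil text. A minimal sketch (`make_prompt` is a hypothetical helper, not part of the model's API):

```python
# Hypothetical helper: formats text in the "<id> speaker: <text>" style
# this model expects (speaker 0 = male, 1 = female).
def make_prompt(text: str, speaker: int = 0) -> str:
    return f"{speaker} speaker: {text}"

print(make_prompt("வணக்கம்!", speaker=1))  # "1 speaker: வணக்கம்!"
```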
## 💻 Usage (Code Example)

```python
import torch
import soundfile as sf
from transformers import AutoModelForCausalLM, AutoTokenizer
from snac import SNAC
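# Requires the torch, transformers, and soundfile packages, plus the SNAC
# codec (assumed to be the `snac` package on PyPI: pip install snac).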
# Path to your fine-tuned model
fine_tuned_checkpoint = "<Model_ID>"
# Load model & tokenizer
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(fine_tuned_checkpoint, torch_dtype=torch.bfloat16).cuda()
tokenizer = AutoTokenizer.from_pretrained(fine_tuned_checkpoint)
# Load SNAC vocoder
print("Loading SNAC model...")
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to("cpu")
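# The magic numbers below are special marker tokens in this model's
# vocabulary (roles inferred from how this script uses them): 128259 opens
# the prompt, 128009/128260 close it, 128263 is left-padding, 128257 marks
# the start of audio codes in the output, 128258 ends generation, and
# 128266 is the offset where SNAC audio codes begin.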
# Input prompt with speaker
prompts = [
    "0 speaker: வணக்கம்! இந்த பயன்பாட்டை பயன்படுத்தி உங்கள் உரையை குரலாக்கலாம்."
]
# Tokenize
all_input_ids = [tokenizer(p, return_tensors="pt").input_ids for p in prompts]
start_token = torch.tensor([[128259]], dtype=torch.int64)
end_tokens = torch.tensor([[128009, 128260]], dtype=torch.int64)
all_modified_input_ids = [torch.cat([start_token, ids, end_tokens], dim=1) for ids in all_input_ids]
# Pad
max_length = max([ids.shape[1] for ids in all_modified_input_ids])
all_padded_tensors, all_attention_masks = [], []
for modified_input_ids in all_modified_input_ids:
    # Left-pad to max_length with the pad token and mask out the padding
    padding = max_length - modified_input_ids.shape[1]
    padded_tensor = torch.cat([torch.full((1, padding), 128263, dtype=torch.int64), modified_input_ids], dim=1)
    attention_mask = torch.cat([torch.zeros((1, padding), dtype=torch.int64), torch.ones((1, modified_input_ids.shape[1]), dtype=torch.int64)], dim=1)
    all_padded_tensors.append(padded_tensor)
    all_attention_masks.append(attention_mask)
input_ids = torch.cat(all_padded_tensors, dim=0).cuda()
attention_mask = torch.cat(all_attention_masks, dim=0).cuda()
# Generate
print("Generating speech...")
with torch.no_grad():
    generated_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=4800,
        do_sample=True,
        temperature=1.0,
        top_p=1.0,
        repetition_penalty=1.1,
        num_return_sequences=1,
        eos_token_id=128258,
    )
# Extract audio codes
print("Parsing output...")
token_indices = (generated_ids == 128257).nonzero(as_tuple=True)
# Keep only the tokens after the last start-of-audio marker (128257)
if len(token_indices[1]) > 0:
    last_occurrence_idx = token_indices[1][-1].item()
    cropped_tensor = generated_ids[:, last_occurrence_idx + 1:]
else:
    cropped_tensor = generated_ids
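# Strip the end-of-audio token (128258) from each sequence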
processed_rows = [row[row != 128258] for row in cropped_tensor]
code_lists = []
for row in processed_rows:
    row_length = row.size(0)
    new_length = (row_length // 7) * 7  # trim to whole 7-token frames
    trimmed_row = row[:new_length] - 128266  # shift IDs down to raw SNAC codes
    code_lists.append(trimmed_row.tolist())
# Decode with SNAC
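# Each 7-token frame interleaves the three SNAC codebook levels as
# [L1, L2, L3, L3, L2, L3, L3], and position i within a frame is stored
# with an i * 4096 offset, so subtracting the offset recovers the raw code.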
def redistribute_codes(code_list):
    layer_1, layer_2, layer_3 = [], [], []
    for i in range(len(code_list) // 7):
        layer_1.append(code_list[7 * i])
        layer_2.append(code_list[7 * i + 1] - 4096)
        layer_3.append(code_list[7 * i + 2] - (2 * 4096))
        layer_3.append(code_list[7 * i + 3] - (3 * 4096))
        layer_2.append(code_list[7 * i + 4] - (4 * 4096))
        layer_3.append(code_list[7 * i + 5] - (5 * 4096))
        layer_3.append(code_list[7 * i + 6] - (6 * 4096))
    codes = [
        torch.tensor(layer_1).unsqueeze(0),
        torch.tensor(layer_2).unsqueeze(0),
        torch.tensor(layer_3).unsqueeze(0),
    ]
    return snac_model.decode(codes)
print("Decoding speech...")
audio_samples = [redistribute_codes(codes) for codes in code_lists]
# Save audio
for i, samples in enumerate(audio_samples):
    audio_data = samples.detach().squeeze().to("cpu").numpy()
    sf.write(f"output_{i}.wav", audio_data, samplerate=24000)
    print(f"Audio {i} saved as output_{i}.wav")
print("Done!")
```
## 📬 Contact

For questions, feedback, or collaboration:

📧 [email protected]