# Without Voice Cloning

In [1]:
import torch
from transformers import AutoTokenizer, AutoProcessor, CsmForConditionalGeneration
from tokenizers.processors import TemplateProcessing
import soundfile as sf

model_id = "Marvis-AI/marvis-tts-0.25b-expressive-preview-transformers"

# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the processor and the model from the same checkpoint id;
# the model is placed on `device` through `device_map`.
processor = AutoProcessor.from_pretrained(model_id)
model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [5]:
# Build the model inputs for a single utterance.
# The leading `[0]` tag in the text selects speaker id 0.
text = "[0]Hello from Marvis."
inputs = processor(
    text,
    add_special_tokens=True,
    return_tensors="pt",
).to(device)
# Display the resulting batch for inspection.
inputs

{'input_ids': tensor([[ 1, 75, 32, 77, 19556, 429, 2828, 3966, 30, 2]],
 device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [7]:
# infer the model
# `token_type_ids` is part of the processor output but is not forwarded to
# `generate()`. Filter it into a new dict instead of popping it from `inputs`:
# an in-place `inputs.pop("token_type_ids")` raises KeyError the second time
# this cell is run, whereas this form is idempotent and leaves `inputs` intact.
generation_inputs = {name: value for name, value in inputs.items() if name != "token_type_ids"}
audio = model.generate(**generation_inputs, output_audio=True)
# Save the first generated sample as a 24 kHz, 16-bit PCM WAV file.
sf.write("example_without_context.wav", audio[0].cpu(), samplerate=24_000, subtype="PCM_16")

# With Voice Cloning

In [2]:
import torch
from datasets import load_dataset, Audio
from transformers import AutoTokenizer, AutoProcessor, CsmForConditionalGeneration
from tokenizers.processors import TemplateProcessing
import soundfile as sf


# Number of leading dataset rows used as (text, audio) context turns; the row
# immediately after them supplies the text that will be synthesized.
NUM_CONTEXT_TURNS = 4

# prepare the inputs
ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
# ensure the audio is 24kHz
ds = ds.cast_column("audio", Audio(sampling_rate=24000))

# Read the context slice and the prompt row once each. Every `ds[...]` access
# re-materialises the selected rows (including the audio column), so repeating
# `ds[:4]` / `ds[4]` per field does the same work several times.
context_rows = ds[:NUM_CONTEXT_TURNS]
prompt_row = ds[NUM_CONTEXT_TURNS]

# 1. context: one turn per row, carrying both the transcript and its audio,
#    with the speaker id (as a string) used as the chat role
conversation = [
    {
        "role": f"{speaker_id}",
        "content": [{"type": "text", "text": text}, {"type": "audio", "path": audio["array"]}],
    }
    for text, audio, speaker_id in zip(
        context_rows["text"], context_rows["audio"], context_rows["speaker_id"]
    )
]

# 2. text prompt: a final text-only turn for which the model generates audio
conversation.append(
    {
        "role": f"{prompt_row['speaker_id']}",
        "content": [{"type": "text", "text": prompt_row["text"]}],
    }
)

# Render the conversation through the processor's chat template, tokenize it,
# and move the resulting batch to the target device.
inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    return_dict=True,
).to(device)


In [3]:
# Inspect the tokenized batch returned by `processor.apply_chat_template`.
inputs

{'input_ids': tensor([[ 1, 75, 33, 77, 1780, 359, 346, 1891, 335, 47,
 2, 44, 108, 49, 11911, 8772, 108, 21198, 108, 49,
 11911, 8772, 108, 21198, 108, 49, 11911, 8772, 108, 21198,
 108, 49, 11911, 8772, 108, 21198, 108, 49, 11911, 8772,
 108, 21198, 108, 49, 11911, 8772, 108, 21198, 108, 49,
 11911, 8772, 108, 21198, 108, 49, 11911, 8772, 108, 21198,
 108, 49, 11911, 8772, 108, 21198, 108, 49, 11911, 8772,
 108, 21198, 108, 49, 11911, 8772, 108, 21198, 108, 49,
 11911, 8772, 108, 21198, 108, 49, 11911, 8772, 108, 21198,
 108, 49, 11911, 8772, 108, 21198, 108, 49, 11911, 8772,
 108, 21198, 108, 49, 11911, 8772, 108, 21198, 108, 49,
 11911, 8772, 108, 21198, 108, 49, 11911, 8772, 108, 21198,
 108, 49, 11911, 8772, 108, 21198, 108, 49, 11911, 8772,
 108, 21198, 108, 25492, 79, 85, 395, 108, 46, 1,
 75, 32, 77, 57, 5248, 23154, 578, 957, 6050, 30,
 2, 44, 108, 49, 11911, 8772, 108, 21198, 108, 49,
 11911, 8772, 108, 21198, 108, 49, 11911, 8772, 108, 21198,
 108, 49, 11911, 8772, 108, 2119

In [5]:
# infer the model
# Forward everything except `token_type_ids` to `generate()`. Filtering into a
# new dict works whether or not the processor emitted that key (a bare
# `inputs.pop("token_type_ids")` would raise KeyError when it is absent) and
# leaves `inputs` unmodified for any later cell that inspects it.
generation_inputs = {name: value for name, value in inputs.items() if name != "token_type_ids"}
audio = model.generate(**generation_inputs, output_audio=True)
# Save the first generated sample as a 24 kHz, 16-bit PCM WAV file.
sf.write("example_with_context.wav", audio[0].cpu(), samplerate=24_000, subtype="PCM_16")

In [5]:
# Decode the first sequence of the batch back into text to inspect the
# prompt as rendered by the chat template.
first_sequence_ids = inputs["input_ids"][0]
processor.tokenizer.decode(first_sequence_ids)

"<|im_start|>[1]What are you working on?<|im_end|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|audio_eos|><|im_start|>[0]I'm figuring out my budget.<|im_end|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|audio_eos|><|im_start|>[1]Umm…. What budget?<|im_end|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|audio_eos|><|im_start|>[0]I'm making a shopping budget, so that I don't spend too much money.<|im_end|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AUDIO|><|AU