import asyncio
import logging
import os
import random
from typing import Dict, List, Tuple

import gradio as gr
import yaml

from src.elevenlabs import (Speaker, check_voice_exists, get_make_voice,
                            play_history, save_history, set_elevenlabs_key)
from src.openailib import top_response, speech_to_text, set_openai_key
from src.tube import extract_audio

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)
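
# This app wires together three helpers: speech-to-text and chat completions
# from src.openailib, voice cloning and playback from src.elevenlabs, and
# reference-audio extraction from video URLs via src.tube. ConversationState
# below holds everything a single conversation needs: the characters, their
# ElevenLabs voices, the system prompt, and the running dialogue history.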

class ConversationState:

    COLORS: list = ['#FFA07A', '#F08080', '#AFEEEE', '#B0E0E6', '#DDA0DD',
                    '#FFFFE0', '#F0E68C', '#90EE90', '#87CEFA', '#FFB6C1']
    YAML_FILEPATH: str = os.path.join(os.path.dirname(__file__), 'voices.yaml')
    AUDIO_SAVEDIR: str = os.path.join(
        os.path.dirname(__file__), 'audio_export')

    def __init__(self,
                 names: list = None,
                 iam: str = None,
                 model: str = "gpt-3.5-turbo",
                 max_tokens: int = 30,
                 temperature: float = 0.5,
                 history: list = None):
        self.model = model
        self.max_tokens = max_tokens
        self.temperature = temperature
        # Make sure the save dir exists, creating any necessary directories
        os.makedirs(self.AUDIO_SAVEDIR, exist_ok=True)
        self.audio_savepath = os.path.join(
            self.AUDIO_SAVEDIR, 'conversation.wav')
        log.info("Resetting conversation")
        with open(self.YAML_FILEPATH, 'r') as file:
            self.characters_yaml = file.read()
            file.seek(0)
            self.characters_dict = yaml.safe_load(file)
        self.all_characters = list(self.characters_dict.keys())
        self.names = names or random.choices(self.all_characters, k=2)
        self.iam = iam or random.choice(self.names)
        assert self.iam in self.names, f"{self.iam} not in {self.names}"
        log.info("Loading voices")
        self.speakers: Dict[str, Speaker] = {}
        self.speakers_descriptions: str = ''
        for i, name in enumerate(self.names):
            if check_voice_exists(name) is None:
                log.warning(f"Voice {name} does not exist")
                continue
            _speaker = Speaker(
                name=name,
                voice=get_make_voice(name),
                color=self.COLORS[i % len(self.COLORS)],
                description=self.characters_dict[name].get(
                    "description", None),
            )
            self.speakers[name] = _speaker
            if _speaker.description is not None:
                self.speakers_descriptions += f"{_speaker.name}: {_speaker.description}.\n"
        # The system prompt is fed into OpenAI to condition the completion
        self.system = "You create funny conversation dialogues. "
        self.system += f"This conversation is between {', '.join(self.names)}. "
        self.system += "Do not introduce new characters. "
        self.system += "Descriptions for each of the characters are:\n"
        for speaker in self.speakers.values():
            self.system += f"{speaker.name}: {speaker.description}\n"
        self.system += "Only return one person's response at a time. "
        self.system += "Each response must start with the character name, then a colon, then their response in a single line. "
        self.system += "Keep the responses short and witty. "
        self.system += "Make sure the responses are only one sentence long. "
        self.system += "Do not continue a previous response. Always start a new response."
        # History is fed in at every step
        self.step = 0
        if history is None:
            history = []
        self.history: List[Tuple[Speaker, str]] = history

    def add_to_history(self, text: str, speaker: Speaker = None):
        if speaker is None:
            speaker = self.speakers[self.iam]
        self.history.append((speaker, text))

    def history_to_prompt(self) -> str:
        prompt: str = ''
        for speaker, text in self.history:
            prompt += f"{speaker.name}:{text}\n"
        return prompt

    def html_history(self) -> str:
        history_html: str = ''
        for speaker, text in self.history:
            _bubble = f"<div style='background-color: {speaker.color}; border-radius: 5px; padding: 5px; margin: 5px;'>{speaker.name}: {text}</div>"
            history_html += _bubble
        return history_html
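
# Rough usage sketch for ConversationState (character names here are purely
# illustrative; real ones come from voices.yaml):
#
#   state = ConversationState(names=["Alice", "Bob"], iam="Alice")
#   state.add_to_history("Hello there!")   # attributed to the "I am" speaker
#   state.history_to_prompt()              # -> "Alice:Hello there!\n"
#   state.html_history()                   # -> colored chat bubbles for the UI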

# Storing state in the global scope like this is bad, but
# perfect is the enemy of good enough and gradio is kind of shit
STATE = ConversationState()


def reset(names, iam, model, max_tokens, temperature):
    # Push new global state to the global scope
    global STATE
    STATE = ConversationState(
        names=names,
        iam=iam,
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
    )
    return STATE.html_history()


def step_mic(audio):
    global STATE
    try:
        request = speech_to_text(audio)
        STATE.add_to_history(request)
    except TypeError as e:
        log.warning(e)
    return STATE.html_history()


def step_continue():
    global STATE
    response = top_response(STATE.history_to_prompt(),
                            system=STATE.system,
                            model=STATE.model,
                            max_tokens=STATE.max_tokens,
                            temperature=STATE.temperature,
                            )
    for line in response.splitlines():
        try:
            # TODO: Add any filters here as assertion errors
            if not line:
                continue
            assert ":" in line, f"Line {line} does not have a colon"
            # Split on the first colon only so the reply text may itself contain colons
            name, text = line.split(":", 1)
            assert name in STATE.speakers, f"Name {name} is not one of {list(STATE.speakers)}"
            speaker = STATE.speakers[name]
            assert len(text) > 0, f"Text {text} is empty"
            STATE.add_to_history(text, speaker=speaker)
        except AssertionError as e:
            log.warning(e)
            continue
    return STATE.html_history()
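
# For reference, the parser above expects each line of the completion to look
# like "<character name>: <one sentence reply>", per the system prompt built in
# ConversationState.__init__. An illustrative (made-up) completion:
#
#   Alice: I told you not to touch the thermostat.
#   Bob: And yet here we are, living in the tropics.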


def save_audio():
    global STATE
    log.info("Saving audio")
    asyncio.run(save_history(STATE.history, STATE.audio_savepath))
    return STATE.audio_savepath


def play_audio():
    global STATE
    log.info("Playing audio")
    asyncio.run(play_history(STATE.history))
    return STATE.html_history()


def make_voices(voices_yaml: str):
    global STATE
    try:
        STATE.characters_dict = yaml.safe_load(voices_yaml)
        for name, metadata in STATE.characters_dict.items():
            videos = metadata['references']
            assert isinstance(name, str), f"Name {name} is not a string"
            assert isinstance(videos, list), f"Videos {videos} is not a list"
            if check_voice_exists(name):
                continue
            audio_paths = []
            for i, video in enumerate(videos):
                assert isinstance(video, dict), f"Video {video} is not a dict"
                assert 'url' in video, f"Video {video} does not have a url"
                url = video['url']
                start_minute = video.get('start_minute', 0)
                duration = video.get('duration_seconds', 120)
                label = os.path.join(STATE.AUDIO_SAVEDIR, f"audio.{name}.{i}")
                output_path = extract_audio(url, label, start_minute, duration)
                audio_paths.append(output_path)
            get_make_voice(name, audio_paths)
    except Exception as e:
        raise e
        # return f"Error: {e}"
    return "Success"
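
# The YAML pasted into the "New Characters" tab is parsed by make_voices() and
# should mirror voices.yaml. A minimal illustrative entry, using only the keys
# read above (the character name and URL are placeholders):
#
#   Gandalf:
#     description: A wise old wizard who speaks in riddles.
#     references:
#       - url: https://www.youtube.com/watch?v=XXXXXXXXXXX
#         start_minute: 1
#         duration_seconds: 120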


# Define the main Gradio UI
with gr.Blocks() as demo:
    gr.HTML('''<center><h1>Speech2Speech</h1></center>''')
    with gr.Tab("Conversation"):
        gr_convo_output = gr.HTML()
        with gr.Row():
            with gr.Column():
                gr_mic = gr.Audio(
                    label="Record audio into conversation",
                    source="microphone",
                    type="filepath",
                )
                gr_add_button = gr.Button(value="Add to conversation")
                gr_playaudio_button = gr.Button(value="Play audio")
                gr_saveaudio_button = gr.Button(value="Export audio")
                gr_outputaudio = gr.Audio(
                    label="Audio output",
                    source="upload",
                    type="filepath",
                )
            with gr.Column():
                gr_iam = gr.Dropdown(
                    choices=STATE.all_characters, label="I am", value=STATE.iam)
                gr_chars = gr.CheckboxGroup(
                    STATE.all_characters, label="Characters", value=STATE.names)
                gr_reset_button = gr.Button(value="Reset conversation")
                with gr.Accordion("Settings", open=False):
                    openai_api_key_textbox = gr.Textbox(
                        placeholder="Paste your OpenAI API key here",
                        show_label=False,
                        lines=1,
                        type="password",
                    )
                    elevenlabs_api_key_textbox = gr.Textbox(
                        placeholder="Paste your ElevenLabs API key here",
                        show_label=False,
                        lines=1,
                        type="password",
                    )
                    gr_model = gr.Dropdown(choices=["gpt-3.5-turbo", "gpt-4"],
                                           label='GPT Model behind conversation', value=STATE.model)
                    gr_max_tokens = gr.Slider(minimum=1, maximum=500, value=STATE.max_tokens,
                                              label="Max tokens", step=1)
                    gr_temperature = gr.Slider(
                        minimum=0.0, maximum=1.0, value=STATE.temperature, label="Temperature (randomness in conversation)")
    with gr.Tab("New Characters"):
        gr_make_voice_button = gr.Button(value="Update Characters")
        gr_voice_data = gr.Textbox(
            lines=25, label="Character YAML config", value=STATE.characters_yaml)
        gr_make_voice_output = gr.Textbox(
            lines=2, label="Character creation logs...")
    gr.HTML('''<center>
    Created by <a href="https://youtube.com/@hu-po">Hu Po</a> GitHub: <a href="https://github.com/hu-po/speech2speech">speech2speech</a>
    <br>
    Duplicate this space: <a href="https://huggingface.co/spaces/hu-po/speech2speech?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
    </center>
    ''')

    # Buttons and actions
    gr_mic.change(step_mic, gr_mic, gr_convo_output)
    openai_api_key_textbox.change(set_openai_key, openai_api_key_textbox, None)
    elevenlabs_api_key_textbox.change(
        set_elevenlabs_key, elevenlabs_api_key_textbox, None)
    gr_add_button.click(step_continue, None, gr_convo_output)
    gr_reset_button.click(
        reset,
        inputs=[gr_chars, gr_iam, gr_model, gr_max_tokens, gr_temperature],
        outputs=[gr_convo_output],
    )
    gr_saveaudio_button.click(save_audio, None, gr_outputaudio)
    gr_playaudio_button.click(play_audio, None, None)
    gr_make_voice_button.click(
        make_voices, inputs=gr_voice_data, outputs=gr_make_voice_output,
    )

if __name__ == "__main__":
    demo.launch()