import asyncio
import logging
import os
import random
from typing import Dict, List, Tuple

import gradio as gr
import yaml

from src.elevenlabs import (Speaker, check_voice_exists, get_make_voice,
                            play_history, save_history, set_elevenlabs_key)
from src.openailib import top_response, speech_to_text, set_openai_key
from src.tube import extract_audio

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)
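
# This app wires together three helpers: speech-to-text and chat completions
# from src.openailib, voice cloning and playback from src.elevenlabs, and
# reference-audio extraction from video URLs via src.tube. ConversationState
# below holds everything a single conversation needs: the characters, their
# ElevenLabs voices, the system prompt, and the running dialogue history.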

class ConversationState:

    COLORS: list = ['#FFA07A', '#F08080', '#AFEEEE', '#B0E0E6', '#DDA0DD',
                    '#FFFFE0', '#F0E68C', '#90EE90', '#87CEFA', '#FFB6C1']
    YAML_FILEPATH: str = os.path.join(os.path.dirname(__file__), 'voices.yaml')
    AUDIO_SAVEDIR: str = os.path.join(
        os.path.dirname(__file__), 'audio_export')

    def __init__(self,
                 names: list = None,
                 iam: str = None,
                 model: str = "gpt-3.5-turbo",
                 max_tokens: int = 30,
                 temperature: float = 0.5,
                 history: list = None):
        self.model = model
        self.max_tokens = max_tokens
        self.temperature = temperature
        # Make sure the save dir exists, creating any necessary directories
        os.makedirs(self.AUDIO_SAVEDIR, exist_ok=True)
        self.audio_savepath = os.path.join(
            self.AUDIO_SAVEDIR, 'conversation.wav')
        log.info("Resetting conversation")
        with open(self.YAML_FILEPATH, 'r') as file:
            self.characters_yaml = file.read()
            file.seek(0)
            self.characters_dict = yaml.safe_load(file)
        self.all_characters = list(self.characters_dict.keys())
        self.names = names or random.choices(self.all_characters, k=2)
        self.iam = iam or random.choice(self.names)
        assert self.iam in self.names, f"{self.iam} not in {self.names}"
        log.info("Loading voices")
        self.speakers: Dict[str, Speaker] = {}
        self.speakers_descriptions: str = ''
        for i, name in enumerate(self.names):
            if check_voice_exists(name) is None:
                log.warning(f"Voice {name} does not exist")
                continue
            _speaker = Speaker(
                name=name,
                voice=get_make_voice(name),
                color=self.COLORS[i % len(self.COLORS)],
                description=self.characters_dict[name].get(
                    "description", None),
            )
            self.speakers[name] = _speaker
            if _speaker.description is not None:
                self.speakers_descriptions += f"{_speaker.name}: {_speaker.description}.\n"
        # The system prompt is fed into OpenAI to condition the completion
        self.system = "You create funny conversation dialogues. "
        self.system += f"This conversation is between {', '.join(self.names)}. "
        self.system += "Do not introduce new characters. "
        self.system += "Descriptions for each of the characters are:\n"
        for speaker in self.speakers.values():
            self.system += f"{speaker.name}: {speaker.description}\n"
        self.system += "Only return one person's response at a time. "
        self.system += "Each response must start with the character name, then a colon, then their response in a single line. "
        self.system += "Keep the responses short and witty. "
        self.system += "Make sure the responses are only one sentence long. "
        self.system += "Do not continue a previous response. Always start a new response."
        # History is fed in at every step
        self.step = 0
        if history is None:
            history = []
        self.history: List[Tuple[Speaker, str]] = history

    def add_to_history(self, text: str, speaker: Speaker = None):
        if speaker is None:
            speaker = self.speakers[self.iam]
        self.history.append((speaker, text))

    def history_to_prompt(self) -> str:
        prompt: str = ''
        for speaker, text in self.history:
            prompt += f"{speaker.name}:{text}\n"
        return prompt

    def html_history(self) -> str:
        history_html: str = ''
        for speaker, text in self.history:
            _bubble = f"<div style='background-color: {speaker.color}; border-radius: 5px; padding: 5px; margin: 5px;'>{speaker.name}: {text}</div>"
            history_html += _bubble
        return history_html
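
# Rough usage sketch for ConversationState (character names here are purely
# illustrative; real ones come from voices.yaml):
#
#   state = ConversationState(names=["Alice", "Bob"], iam="Alice")
#   state.add_to_history("Hello there!")   # attributed to the "I am" speaker
#   state.history_to_prompt()              # -> "Alice:Hello there!\n"
#   state.html_history()                   # -> colored chat bubbles for the UI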

# Storing state in the global scope like this is bad, but
# perfect is the enemy of good enough and gradio is kind of shit
STATE = ConversationState()


def reset(names, iam, model, max_tokens, temperature):
    # Push new global state to the global scope
    global STATE
    STATE = ConversationState(
        names=names,
        iam=iam,
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
    )
    return STATE.html_history()


def step_mic(audio):
    global STATE
    try:
        request = speech_to_text(audio)
        STATE.add_to_history(request)
    except TypeError as e:
        log.warning(e)
    return STATE.html_history()


def step_continue():
    global STATE
    response = top_response(STATE.history_to_prompt(),
                            system=STATE.system,
                            model=STATE.model,
                            max_tokens=STATE.max_tokens,
                            temperature=STATE.temperature,
                            )
    for line in response.splitlines():
        try:
            # TODO: Add any filters here as assertion errors
            if not line:
                continue
            assert ":" in line, f"Line {line} does not have a colon"
            # Split on the first colon only so the reply text may itself contain colons
            name, text = line.split(":", 1)
            assert name in STATE.speakers, f"Name {name} is not one of {list(STATE.speakers)}"
            speaker = STATE.speakers[name]
            assert len(text) > 0, f"Text {text} is empty"
            STATE.add_to_history(text, speaker=speaker)
        except AssertionError as e:
            log.warning(e)
            continue
    return STATE.html_history()
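
# For reference, the parser above expects each line of the completion to look
# like "<character name>: <one sentence reply>", per the system prompt built in
# ConversationState.__init__. An illustrative (made-up) completion:
#
#   Alice: I told you not to touch the thermostat.
#   Bob: And yet here we are, living in the tropics.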


def save_audio():
    global STATE
    log.info("Saving audio")
    asyncio.run(save_history(STATE.history, STATE.audio_savepath))
    return STATE.audio_savepath


def play_audio():
    global STATE
    log.info("Playing audio")
    asyncio.run(play_history(STATE.history))
    return STATE.html_history()


def make_voices(voices_yaml: str):
    global STATE
    try:
        STATE.characters_dict = yaml.safe_load(voices_yaml)
        for name, metadata in STATE.characters_dict.items():
            videos = metadata['references']
            assert isinstance(name, str), f"Name {name} is not a string"
            assert isinstance(videos, list), f"Videos {videos} is not a list"
            if check_voice_exists(name):
                continue
            audio_paths = []
            for i, video in enumerate(videos):
                assert isinstance(video, dict), f"Video {video} is not a dict"
                assert 'url' in video, f"Video {video} does not have a url"
                url = video['url']
                start_minute = video.get('start_minute', 0)
                duration = video.get('duration_seconds', 120)
                label = os.path.join(STATE.AUDIO_SAVEDIR, f"audio.{name}.{i}")
                output_path = extract_audio(url, label, start_minute, duration)
                audio_paths.append(output_path)
            get_make_voice(name, audio_paths)
    except Exception as e:
        raise e
        # return f"Error: {e}"
    return "Success"
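
# The YAML pasted into the "New Characters" tab is parsed by make_voices() and
# should mirror voices.yaml. A minimal illustrative entry, using only the keys
# read above (the character name and URL are placeholders):
#
#   Gandalf:
#     description: A wise old wizard who speaks in riddles.
#     references:
#       - url: https://www.youtube.com/watch?v=XXXXXXXXXXX
#         start_minute: 1
#         duration_seconds: 120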


# Define the main Gradio UI
with gr.Blocks() as demo:
    gr.HTML('''<center><h1>Speech2Speech</h1></center>''')
    with gr.Tab("Conversation"):
        gr_convo_output = gr.HTML()
        with gr.Row():
            with gr.Column():
                gr_mic = gr.Audio(
                    label="Record audio into conversation",
                    source="microphone",
                    type="filepath",
                )
                gr_add_button = gr.Button(value="Add to conversation")
                gr_playaudio_button = gr.Button(value="Play audio")
                gr_saveaudio_button = gr.Button(value="Export audio")
                gr_outputaudio = gr.Audio(
                    label="Audio output",
                    source="upload",
                    type="filepath",
                )
            with gr.Column():
                gr_iam = gr.Dropdown(
                    choices=STATE.all_characters, label="I am", value=STATE.iam)
                gr_chars = gr.CheckboxGroup(
                    STATE.all_characters, label="Characters", value=STATE.names)
                gr_reset_button = gr.Button(value="Reset conversation")
                with gr.Accordion("Settings", open=False):
                    openai_api_key_textbox = gr.Textbox(
                        placeholder="Paste your OpenAI API key here",
                        show_label=False,
                        lines=1,
                        type="password",
                    )
                    elevenlabs_api_key_textbox = gr.Textbox(
                        placeholder="Paste your ElevenLabs API key here",
                        show_label=False,
                        lines=1,
                        type="password",
                    )
                    gr_model = gr.Dropdown(choices=["gpt-3.5-turbo", "gpt-4"],
                                           label='GPT Model behind conversation', value=STATE.model)
                    gr_max_tokens = gr.Slider(minimum=1, maximum=500, value=STATE.max_tokens,
                                              label="Max tokens", step=1)
                    gr_temperature = gr.Slider(
                        minimum=0.0, maximum=1.0, value=STATE.temperature, label="Temperature (randomness in conversation)")
    with gr.Tab("New Characters"):
        gr_make_voice_button = gr.Button(value="Update Characters")
        gr_voice_data = gr.Textbox(
            lines=25, label="Character YAML config", value=STATE.characters_yaml)
        gr_make_voice_output = gr.Textbox(
            lines=2, label="Character creation logs...")
    gr.HTML('''<center>
    Created by <a href="https://youtube.com/@hu-po">Hu Po</a> GitHub: <a href="https://github.com/hu-po/speech2speech">speech2speech</a>
    <br>
    Duplicate this space: <a href="https://huggingface.co/spaces/hu-po/speech2speech?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
    </center>
    ''')

    # Buttons and actions
    gr_mic.change(step_mic, gr_mic, gr_convo_output)
    openai_api_key_textbox.change(set_openai_key, openai_api_key_textbox, None)
    elevenlabs_api_key_textbox.change(
        set_elevenlabs_key, elevenlabs_api_key_textbox, None)
    gr_add_button.click(step_continue, None, gr_convo_output)
    gr_reset_button.click(
        reset,
        inputs=[gr_chars, gr_iam, gr_model, gr_max_tokens, gr_temperature],
        outputs=[gr_convo_output],
    )
    gr_saveaudio_button.click(save_audio, None, gr_outputaudio)
    gr_playaudio_button.click(play_audio, None, None)
    gr_make_voice_button.click(
        make_voices, inputs=gr_voice_data, outputs=gr_make_voice_output,
    )

if __name__ == "__main__":
    demo.launch()