# =======================================================================================
# ScienceBrain.AI - A Multi-Modal Gradio App
# Author: Your Name Here (with a little help from a friendly AI)
# Version: 2.0 - Now with 100% more witty comments!
# =======================================================================================
# ---------------------------------------------------------------------------------------
# 🐍 Step 1: Import the Ark of Libraries
# "Any fool can write code that a computer can understand. Good programmers write code
# that humans can understand." - Martin Fowler. So let's make this understandable.
# ---------------------------------------------------------------------------------------
# --- Standard Library Imports ---
import os
import base64
import glob
import re
from datetime import datetime
# --- Third-Party Imports ---
import gradio as gr
import openai
from openai import OpenAI
import pytz
import cv2
from dotenv import load_dotenv
# moviepy powers the audio extraction in process_video. This is the moviepy 1.x
# import path; on moviepy >= 2.0 it becomes "from moviepy import VideoFileClip".
from moviepy.editor import VideoFileClip
# ---------------------------------------------------------------------------------------
# βš™οΈ Step 2: Configuration & Global Domination... I mean, Initialization
# "The best way to predict the future is to invent it." - Alan Kay
# Here, we're inventing the configuration for our glorious app.
# ---------------------------------------------------------------------------------------
# Load environment variables from a .env file. Perfect for local testing.
# On Hugging Face Spaces, these should be set in the "Secrets" section.
load_dotenv()
# --- API Keys & Endpoints ---
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
OPENAI_ORG_ID = os.getenv('OPENAI_ORG_ID')
HF_KEY = os.getenv('HF_KEY') # Your Hugging Face Token, the key to the OSS kingdom.
# NOTE: the OSS model is reached through the OpenAI-compatible endpoint below; the
# app assumes the HF token is honored there (e.g. via a router/proxy arrangement).
FIREWORKS_API_BASE = "https://api.fireworks.ai/inference/v1"
# --- Model Configuration ---
DEFAULT_OSS_MODEL = "openai/gpt-oss-120b"
DEFAULT_OPENAI_MODEL = "gpt-4o-2024-05-13"
# Greetings from Mound, Minnesota! A little local flavor in the code.
# ---------------------------------------------------------------------------------------
# πŸ› οΈ Step 3: Core Helper Functions - The Unsung Heroes
# These functions do the behind-the-scenes work. They're like the roadies of a rock concert.
# ---------------------------------------------------------------------------------------
def get_llm_client(model_name: str) -> OpenAI:
"""
🐍 Creates and returns an OpenAI client configured for the selected model.
This function is the bouncer of our AI club. It checks your credentials (API keys)
and directs you to the right VIP lounge (API endpoint).
Args:
model_name (str): The name of the model to use.
Returns:
OpenAI: A configured OpenAI client instance.
Raises:
gr.Error: If the required API key for the selected model is not found.
"""
# If we're using the cool, open-source model...
if model_name == DEFAULT_OSS_MODEL:
if not HF_KEY:
# "Houston, we have a problem." - Apollo 13
raise gr.Error("Hugging Face API Key (HF_KEY) is missing! Add it to your Space secrets.")
# Point the client to the Fireworks.ai proxy for the OSS model.
return OpenAI(api_key=HF_KEY, base_url=FIREWORKS_API_BASE)
# Otherwise, for the standard OpenAI models...
else:
if not OPENAI_API_KEY:
# "I've got a bad feeling about this." - Han Solo
raise gr.Error("OpenAI API Key is missing! Add it to your Space secrets.")
# Use the standard OpenAI client configuration.
return OpenAI(api_key=OPENAI_API_KEY, organization=OPENAI_ORG_ID)
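# A minimal usage sketch of the router above (illustrative; assumes the
# corresponding key is present in the environment):
#   client = get_llm_client(DEFAULT_OSS_MODEL)   # -> client pointed at Fireworks
#   client = get_llm_client("gpt-4o-mini")       # -> standard OpenAI client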
def generate_filename(prompt: str, file_type: str, original_name: str = None) -> str:
"""
πŸ’Ύ Generates a safe, unique, and descriptive filename.
Because 'output_1.txt' is for amateurs. We're creating masterpieces here,
and they deserve a proper name.
Args:
prompt (str): The user's prompt, used to make the name descriptive.
file_type (str): The file extension (e.g., "md", "png").
original_name (str, optional): The original name of an uploaded file.
Returns:
str: A clean, timestamped filename.
"""
# Get the current time in a sane timezone. No one likes UTC confusion.
central = pytz.timezone('US/Central')
safe_date_time = datetime.now(central).strftime("%m%d_%H%M")
# "I'm going to make him an offer he can't refuse." - The Godfather
# We're making an offer to the filesystem it can't refuse by removing illegal characters.
safe_prompt = re.sub(r'[<>:"/\\|?*\n]', ' ', prompt).strip()[:50]
# If it's a response to an uploaded file, include the original name.
if original_name:
base_name = os.path.splitext(original_name)[0]
file_stem = f"{safe_date_time}_{safe_prompt}_{base_name}"[:100]
else:
file_stem = f"{safe_date_time}_{safe_prompt}"[:100]
return f"{file_stem}.{file_type}"
def create_and_save_file(content: str, prompt: str, should_save: bool, file_type: str = "md", original_name: str = None):
"""
✍️ Saves content to a file if the user has blessed us with permission.
"With great power comes great responsibility." - Uncle Ben. The power to save
is in the user's hands.
Args:
content (str): The generated content to save.
prompt (str): The user prompt that generated the content.
should_save (bool): A flag indicating whether to save the file.
file_type (str, optional): The file extension. Defaults to "md".
original_name (str, optional): The original name of an input file.
"""
if not should_save:
print("πŸ’Ύ Save checkbox is unchecked. Skipping file save.")
return
filename = generate_filename(prompt, file_type, original_name)
try:
with open(filename, "w", encoding="utf-8") as f:
# We save both the prompt and the response for context. It's just polite.
full_content = f"πŸ’‘ PROMPT:\n{prompt}\n\n{'='*20}\n\nπŸ€– RESPONSE:\n{content}"
f.write(full_content)
print(f"βœ… Successfully saved conversation to {filename}")
except Exception as e:
# It's not a bug, it's an undocumented feature.
print(f"πŸ”₯ Error saving file {filename}: {e}")
# ---------------------------------------------------------------------------------------
# 🧠 Step 4: AI Processing Functions - Where the Magic Happens
# "Any sufficiently advanced technology is indistinguishable from magic." - Arthur C. Clarke
# ---------------------------------------------------------------------------------------
def process_text(client: OpenAI, model_name: str, history: list, text_input: str, should_save: bool) -> list:
"""
πŸ’¬ Handles a text-only prompt. The bread and butter of chat apps.
It's simple, elegant, and gets the job done. Like a little black dress.
Args:
client (OpenAI): The configured OpenAI client.
model_name (str): The name of the AI model.
history (list): The conversation history in OpenAI format.
text_input (str): The user's text prompt.
should_save (bool): Flag to determine if the output should be saved.
Returns:
list: The updated conversation history.
"""
history.append({"role": "user", "content": text_input})
completion = client.chat.completions.create(
model=model_name,
messages=history,
stream=False # We're keeping it simple for now. Streaming is a whole other party.
)
response = completion.choices[0].message.content
history.append({"role": "assistant", "content": response})
create_and_save_file(response, text_input, should_save)
return history
def process_image(client: OpenAI, model_name: str, history: list, image_path: str, user_prompt: str, should_save: bool) -> list:
"""
πŸ–ΌοΈ Processes an image with a text prompt. A picture is worth a thousand words,
but with AI, it can be worth a thousand lines of code, a poem, or a recipe.
Args:
client (OpenAI): The configured OpenAI client.
model_name (str): The name of the AI model.
history (list): The conversation history.
image_path (str): The local path to the uploaded image.
user_prompt (str): The text prompt accompanying the image.
should_save (bool): Flag to determine if the output should be saved.
Returns:
list: The updated conversation history.
"""
# "I'll be back." - The Terminator. The image will be back, but as Base64.
with open(image_path, "rb") as img_file:
base64_image = base64.b64encode(img_file.read()).decode("utf-8")
# Construct the special message format for multimodal input.
image_message = {
"role": "user",
"content": [
{"type": "text", "text": user_prompt},
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
]
}
history.append(image_message)
response = client.chat.completions.create(
model=model_name,
messages=history,
temperature=0.0 # Let's be factual with images.
)
image_response = response.choices[0].message.content
history.append({"role": "assistant", "content": image_response})
original_name = os.path.basename(image_path)
create_and_save_file(image_response, user_prompt, should_save, original_name=original_name)
return history
def process_audio(client: OpenAI, model_name: str, history: list, audio_path: str, user_prompt: str, should_save: bool) -> list:
"""
🎀 Transcribes audio using Whisper, then sends the transcript to the chat model.
"Listen to them. The children of the night. What music they make!" - Dracula.
We're listening, and turning that music into text.
Args:
client (OpenAI): The configured OpenAI client.
model_name (str): The name of the AI model.
history (list): The conversation history.
audio_path (str): Path to the uploaded audio file.
user_prompt (str): The text prompt to guide the response to the transcript.
should_save (bool): Flag to determine if the output should be saved.
Returns:
list: The updated conversation history.
"""
try:
with open(audio_path, "rb") as audio_file:
# Let Whisper do its thing. It's surprisingly good at it.
transcription = client.audio.transcriptions.create(
model="whisper-1",
file=audio_file
).text
# Combine the user's prompt with the new transcript for full context.
full_prompt = f"{user_prompt}\n\n--- Audio Transcription ---\n{transcription}"
history.append({"role": "user", "content": full_prompt})
# Now, get a response from the main chat model.
completion = client.chat.completions.create(model=model_name, messages=history)
response = completion.choices[0].message.content
history.append({"role": "assistant", "content": response})
create_and_save_file(response, full_prompt, should_save, original_name=os.path.basename(audio_path))
except openai.BadRequestError as e:
raise gr.Error(f"Audio processing error: {e}")
except Exception as e:
raise gr.Error(f"An unexpected error occurred during audio processing: {e}")
return history
def process_video(client: OpenAI, model_name: str, history: list, video_path: str, user_prompt: str, should_save: bool) -> list:
"""
🎬 Processes a video by extracting frames and audio for a comprehensive summary.
"Life moves pretty fast. If you don't stop and look around once in a while,
you could miss it." - Ferris Bueller. We're stopping and looking, frame by frame.
Args:
client (OpenAI): The configured OpenAI client.
model_name (str): The name of the AI model.
history (list): The conversation history.
video_path (str): Path to the uploaded video file.
user_prompt (str): The text prompt for the video summary.
should_save (bool): Flag to determine if the output should be saved.
Returns:
list: The updated conversation history.
"""
try:
# --- Frame Extraction ---
# "I'm ready for my close-up, Mr. DeMille." - Sunset Boulevard
base64Frames = []
video = cv2.VideoCapture(video_path)
fps = video.get(cv2.CAP_PROP_FPS)
        if fps <= 0:
            raise gr.Error("Could not read video file. Is it valid?")
total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
frames_to_skip = int(fps * 2) # One frame every 2 seconds.
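        # Sampling arithmetic (example): at 30 fps, frames_to_skip = 60, so a
        # 2-minute clip (~3600 frames) yields roughly 60 frames for the model.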
for i in range(0, total_frames, frames_to_skip):
video.set(cv2.CAP_PROP_POS_FRAMES, i)
success, frame = video.read()
if not success: break
_, buffer = cv2.imencode(".jpg", frame)
base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
video.release()
if not base64Frames: raise gr.Error("Could not extract any frames from the video.")
# --- Construct the Message Payload ---
# Start with the frames.
messages = [
{"type": "text", "text": "These are frames from a video. Please analyze them."},
            *map(lambda x: {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{x}", "detail": "low"}}, base64Frames),
]
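        # "detail": "low" asks the vision model to process each frame at reduced
        # resolution for a small, fixed per-image token cost; use "high" only if
        # fine-grained frame detail actually matters.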
# --- Audio Extraction & Transcription ---
audio_path = None
try:
with VideoFileClip(video_path) as clip:
if clip.audio:
audio_path = "temp_video_audio.mp3"
clip.audio.write_audiofile(audio_path, bitrate="32k", logger=None)
if audio_path:
with open(audio_path, "rb") as audio_file:
transcript = client.audio.transcriptions.create(model="whisper-1", file=audio_file).text
messages.append({"type": "text", "text": f"--- Video Transcription ---\n{transcript}"})
os.remove(audio_path) # "Leave the gun. Take the cannoli." - Clean up after ourselves.
except Exception as e:
print(f"⚠️ Audio extraction/transcription failed or skipped: {e}")
# Finally, add the user's guiding prompt.
messages.append({"type": "text", "text": user_prompt})
history.append({"role": "user", "content": messages})
# --- Get the Final AI Response ---
response = client.chat.completions.create(model=model_name, messages=history)
result = response.choices[0].message.content
history.append({"role": "assistant", "content": result})
create_and_save_file(result, user_prompt, should_save, original_name=os.path.basename(video_path))
return history
except Exception as e:
raise gr.Error(f"Video processing failed spectacularly: {str(e)}")
# ---------------------------------------------------------------------------------------
# πŸ–ΌοΈ Step 5: Gradio UI & Event Handlers - The Face of the Operation
# "The noblest art is that of making others happy." - P.T. Barnum.
# Our UI's goal is to make the user happy. Or at least not frustrated.
# ---------------------------------------------------------------------------------------
def convert_history_to_openai_format(gradio_history: list) -> list:
"""
πŸ”„ Converts Gradio's chat history format to the OpenAI API format.
It's like translating from English to Klingon, but for dictionaries.
Args:
gradio_history (list): History from a gr.Chatbot component.
Returns:
list: History formatted for the OpenAI API.
"""
openai_history = []
for user_msg, bot_msg in gradio_history:
if user_msg:
# Handle complex multimodal user messages for history
if isinstance(user_msg, tuple):
text, file_path = user_msg
# This part needs to be more robust if we want to "replay" multimodal history
openai_history.append({"role": "user", "content": text})
else:
openai_history.append({"role": "user", "content": user_msg})
if bot_msg:
openai_history.append({"role": "assistant", "content": bot_msg})
return openai_history
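# Example (illustrative) of the conversion performed above:
#   [["Hi there", "Hello!"]]
#     -> [{"role": "user", "content": "Hi there"},
#         {"role": "assistant", "content": "Hello!"}]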
def get_file_processor(file_path: str):
"""
πŸ” Determines which processing function to use based on file extension.
A simple but elegant router. The Grand Central Station of file handling.
Args:
file_path (str): The path to the file.
Returns:
function: The appropriate processing function or None.
"""
ext = os.path.splitext(file_path)[1].lower()
if ext in ['.png', '.jpg', '.jpeg', '.gif', '.webp']: return process_image
    if ext in ['.mp3', '.wav', '.m4a', '.flac', '.ogg']: return process_audio
    if ext in ['.mp4', '.mov', '.avi']: return process_video
return None
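# Illustrative routing: get_file_processor("cells.png") -> process_image,
# get_file_processor("lecture.mp3") -> process_audio, and an unknown extension
# (e.g. "notes.txt") -> None, which the caller surfaces as an error.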
def handle_multimodal_submit(message: dict, history: list, model_name: str, should_save: bool):
"""
πŸš€ The main event handler for the chat interface.
This function is the quarterback. It takes the snap (user input), reads the
defense (checks for files), and makes the play (calls the right processor).
Args:
message (dict): The output from the gr.MultimodalTextbox.
history (list): The current chat history in Gradio format.
model_name (str): The selected AI model.
should_save (bool): The state of the save checkbox.
Yields:
Updates to the Gradio UI components.
"""
text_prompt = message["text"]
files = message["files"]
# --- Optimistic UI Update ---
# Show the user's message in the chat immediately. It feels more responsive.
# "Carpe diem. Seize the day, boys. Make your lives extraordinary." - Dead Poets Society
user_turn_content = text_prompt
    if files:
        # Gradio may hand back plain file paths or tempfile wrappers; handle both.
        file_names = ", ".join(os.path.basename(getattr(f, "name", f)) for f in files)
        user_turn_content += f"\n\n*📎 Attached: {file_names}*"
history.append([user_turn_content, None])
yield history, gr.MultimodalTextbox(value=None, interactive=False) # Disable input while processing
try:
# Get the right AI client for the job.
client = get_llm_client(model_name)
# Convert history to the format our AI overlords demand.
openai_history = convert_history_to_openai_format(history[:-1]) # Exclude the current turn
# --- Route to the Correct Processor ---
if not files:
# It's just text. Easy peasy.
updated_openai_history = process_text(client, model_name, openai_history, text_prompt, should_save)
else:
# We have files! To the file-type-switch-case-mobile!
            file_path = getattr(files[0], "name", files[0])  # Only the first file is processed.
processor = get_file_processor(file_path)
if processor:
updated_openai_history = processor(client, model_name, openai_history, file_path, text_prompt, should_save)
else:
raise gr.Error(f"Unsupported file type: {os.path.splitext(file_path)[1]}")
# Update the last message in the Gradio history with the AI's response.
history[-1][1] = updated_openai_history[-1]['content']
yield history, gr.MultimodalTextbox(value=None, interactive=True)
except Exception as e:
# "Well, nobody's perfect." - Some Like It Hot
# If something went wrong, let the user know and re-enable the input.
history[-1][1] = f"**πŸ”₯ An Error Occurred:** {str(e)}"
yield history, gr.MultimodalTextbox(value=message, interactive=True)
def update_file_list_display(file_types: list):
"""
πŸ”„ Refreshes the list of generated files in the sidebar.
It's like hitting F5, but with more Python.
"""
if not file_types: return gr.update(choices=[], value=[])
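    # Filenames from generate_filename start with an "MMDD_HHMM" timestamp, so a
    # stem of at least 10 characters is a cheap heuristic to skip app files like
    # "app.py" when listing generated output.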
all_files = [f for f in glob.glob("*.*") if os.path.splitext(f)[1].lower() in file_types and len(os.path.splitext(f)[0]) >= 10]
all_files.sort(key=lambda x: os.path.getmtime(x), reverse=True)
return gr.update(choices=all_files, value=[])
def delete_selected_files(files_to_delete: list, current_filter: list):
"""
πŸ—‘οΈ Deletes the files selected by the user. "Execute Order 66."
"""
if not files_to_delete:
gr.Warning("No files selected to delete. Are you just testing the button?")
return update_file_list_display(current_filter)
for file_path in files_to_delete:
try:
os.remove(file_path)
except OSError as e:
gr.Warning(f"Could not delete {file_path}. It's probably hiding. Error: {e}")
gr.Info(f"Deleted {len(files_to_delete)} files. They're gone. Reduced to atoms.")
return update_file_list_display(current_filter)
# ---------------------------------------------------------------------------------------
# πŸš€ Step 6: Main Application Entry Point - "Engage!"
# This is where we build the UI and launch the app into the digital cosmos.
# ---------------------------------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft(primary_hue="teal", secondary_hue="orange"), title="ScienceBrain.AI") as demo:
gr.Markdown("# �🧠 ScienceBrain.AI\n*A Multi-Modal Interface for Advanced AI Models*")
with gr.Row():
with gr.Column(scale=1, min_width=300):
# --- Sidebar Controls ---
gr.Markdown("### βš™οΈ Controls")
model_selector = gr.Dropdown(
label="Select Model",
choices=[DEFAULT_OSS_MODEL, DEFAULT_OPENAI_MODEL, "gpt-4o-mini", "gpt-3.5-turbo"],
value=DEFAULT_OSS_MODEL,
)
save_checkbox = gr.Checkbox(label="πŸ’Ύ Save Session Output", value=True)
clear_btn = gr.Button("πŸ—‘οΈ Clear Session", variant="stop")
with gr.Accordion("πŸ“‚ File Management", open=False):
file_filter = gr.CheckboxGroup(
label="Filter by Type",
choices=[".md", ".png", ".jpg", ".pdf", ".wav", ".mp3", ".mp4"],
value=[".md", ".png"]
)
file_list = gr.CheckboxGroup(label="Generated Files (Select to Delete)", choices=[], value=[])
with gr.Row():
refresh_files_btn = gr.Button("πŸ”„ Refresh")
delete_files_btn = gr.Button("πŸ—‘οΈ Delete", variant="primary")
with gr.Column(scale=4):
# --- Main Chat Interface ---
chatbot = gr.Chatbot(
label="Conversation",
bubble_full_width=False,
height=650,
avatar_images=(None, "https://openmoji.org/data/color/svg/1F916.svg") # User, Robot
)
multimodal_input = gr.MultimodalTextbox(
file_types=["image", "audio", "video"],
placeholder="Type a message or upload a file...",
label="Your Input"
)
# --- Event Listener Wiring ---
# "There is no spoon." - The Matrix. There are only functions and triggers.
# Main submit action
multimodal_input.submit(
fn=handle_multimodal_submit,
inputs=[multimodal_input, chatbot, model_selector, save_checkbox],
outputs=[chatbot, multimodal_input]
)
    # Clear chat action: empty the chatbot and reset the input box.
    clear_btn.click(fn=lambda: ([], None), inputs=None, outputs=[chatbot, multimodal_input])
# File management actions
refresh_files_btn.click(fn=update_file_list_display, inputs=[file_filter], outputs=[file_list])
file_filter.change(fn=update_file_list_display, inputs=[file_filter], outputs=[file_list])
delete_files_btn.click(fn=delete_selected_files, inputs=[file_list, file_filter], outputs=[file_list])
# Load initial file list when the app starts.
demo.load(fn=update_file_list_display, inputs=[file_filter], outputs=[file_list])
# "So, this is how liberty dies... with thunderous applause." - PadmΓ© Amidala
# Or, in our case, how an app starts... with a simple launch command.
if __name__ == "__main__":
demo.launch(debug=True)