import gradio as gr
import openai
import base64
from PIL import Image
import io
import fitz  # PyMuPDF

# Requires the legacy openai<1.0 Python SDK (openai.ChatCompletion / openai.Audio),
# plus gradio, pymupdf, and pillow.

# ---------- PDF Text Extraction ----------
def extract_text_from_pdf(pdf_file):
    try:
        text = ""
        pdf_document = fitz.open(pdf_file)
        for page in pdf_document:
            text += page.get_text()
        pdf_document.close()
        return text
    except Exception as e:
        return f"Error extracting text from PDF: {str(e)}"

# ---------- PDF Quiz Generation ----------
def generate_mcq_quiz(pdf_content, num_questions, openai_api_key, model_choice):
    if not openai_api_key:
        return "Error: No API key provided."

    openai.api_key = openai_api_key
    limited_content = pdf_content[:8000] if len(pdf_content) > 8000 else pdf_content

    prompt = f"""Based on the following document content, generate {num_questions} multiple-choice quiz questions.
For each question:
1. Create a clear question based on key concepts in the document
2. Provide 4 options (A, B, C, D)
3. Indicate the correct answer
4. Briefly explain the correct answer

Document content:
{limited_content}
"""
    try:
        response = openai.ChatCompletion.create(
            model=model_choice,
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error generating quiz: {str(e)}"

# ---------- Image Processing ----------
def generate_image_response(input_text, image, openai_api_key, model_choice):
    if not openai_api_key:
        return "Error: No API key provided."

    openai.api_key = openai_api_key

    # Convert the PIL image to a base64-encoded PNG data URL
    buffered = io.BytesIO()
    image.save(buffered, format="PNG")
    base64_str = base64.b64encode(buffered.getvalue()).decode("utf-8")

    try:
        response = openai.ChatCompletion.create(
            model=model_choice,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": input_text},
                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_str}"}}
                    ]
                }
            ]
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error processing image: {str(e)}"

# ---------- Voice Processing ----------
def process_voice_input(audio_path, openai_api_key, model_choice):
    # Callers unpack (response, transcribed_text), so always return a 2-tuple
    if not openai_api_key:
        return "Error: No API key provided.", ""
    try:
        openai.api_key = openai_api_key

        # Transcribe the audio with Whisper, then answer with the selected chat model
        with open(audio_path, "rb") as audio_file:
            transcript = openai.Audio.transcribe("whisper-1", audio_file)
        prompt = transcript["text"]

        response = openai.ChatCompletion.create(
            model=model_choice,
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content, prompt
    except Exception as e:
        return f"Error processing voice: {str(e)}", ""

# ---------- Unified Chatbot Handler ----------
def chatbot(input_text, image, pdf_file, audio_file, openai_api_key, model_choice,
            pdf_content, num_quiz_questions, pdf_quiz_mode, audio_mode, history):
    if history is None:
        history = []

    new_pdf_content = pdf_content

    # Handle PDF file upload and extract text
    if pdf_file is not None:
        new_pdf_content = extract_text_from_pdf(pdf_file)

    # Handle PDF Quiz Mode
    if pdf_quiz_mode:
        if new_pdf_content:
            quiz_response = generate_mcq_quiz(new_pdf_content, int(num_quiz_questions), openai_api_key, model_choice)
            history.append((f"👤: [PDF Quiz - {num_quiz_questions} questions]", f"🤖: {quiz_response}"))
        else:
            history.append(("👤: [PDF Quiz]", "🤖: Please upload a PDF file first."))

    # Handle Audio Mode
    elif audio_mode:
        if audio_file is not None:
            response, transcribed_text = process_voice_input(audio_file, openai_api_key, model_choice)
            history.append((f"👤 (Voice): {transcribed_text}", f"🤖: {response}"))
        else:
            history.append(("👤: [Audio]", "🤖: Please upload or record an audio file."))

    # Handle Image Mode
    else:
        if image is not None:
            response = generate_image_response(input_text, image, openai_api_key, model_choice)
            history.append((f"👤: {input_text or '[Image]'}", f"🤖: {response}"))
        elif input_text:
            # Handle text-only input when no image is provided
            try:
                openai.api_key = openai_api_key
                response = openai.ChatCompletion.create(
                    model=model_choice,
                    messages=[{"role": "user", "content": input_text}]
                )
                history.append((f"👤: {input_text}", f"🤖: {response.choices[0].message.content}"))
            except Exception as e:
                history.append((f"👤: {input_text}", f"🤖: Error: {str(e)}"))

    return "", None, None, None, new_pdf_content, history

# ---------- Clear Chat ----------
def clear_history():
    return "", None, None, None, "", []

# ---------- Input Type Toggle ----------
def update_input_type(choice):
    if choice == "Image":
        hint_text = """
💡 **Image Mode Tips:**
- Both **o1** and **o3-mini** support image analysis
- o1 provides more detailed analysis but costs more
- o3-mini is faster and more cost-effective for simple image questions
"""
        return (
            gr.update(visible=True),    # input_text
            gr.update(visible=True),    # image_input
            gr.update(visible=False),   # pdf_input
            gr.update(visible=False),   # audio_input
            gr.update(visible=False),   # quiz_slider
            gr.update(value=False),     # pdf_quiz_mode
            gr.update(value=False),     # audio_mode
            gr.update(value=hint_text, visible=True)  # model_hint
        )
    elif choice == "PDF(QUIZ)":
        hint_text = """
📚 **PDF Quiz Mode Tips:**
- Both models can generate quizzes from PDF content
- o1 creates more comprehensive and detailed questions
- o3-mini generates quizzes faster with good quality
- Large PDFs are automatically limited to the first 8000 characters
"""
        return (
            gr.update(visible=False),   # input_text
            gr.update(visible=False),   # image_input
            gr.update(visible=True),    # pdf_input
            gr.update(visible=False),   # audio_input
            gr.update(visible=True),    # quiz_slider
            gr.update(value=True),      # pdf_quiz_mode
            gr.update(value=False),     # audio_mode
            gr.update(value=hint_text, visible=True)  # model_hint
        )
    elif choice == "Audio":
        hint_text = """
🎤 **Audio Mode Tips:**
- **Important:** Audio transcription uses OpenAI's `whisper-1` model (separate cost)
- **gpt-4o-transcribe**: More sophisticated responses but higher cost per token
- **gpt-4o-mini-transcribe**: Cost-effective for most audio conversations
- Supports common audio formats (MP3, WAV, M4A, etc.)
- Maximum audio file size: 25MB
"""
        return (
            gr.update(visible=False),   # input_text
            gr.update(visible=False),   # image_input
            gr.update(visible=False),   # pdf_input
            gr.update(visible=True),    # audio_input
            gr.update(visible=False),   # quiz_slider
            gr.update(value=False),     # pdf_quiz_mode
            gr.update(value=True),      # audio_mode
            gr.update(value=hint_text, visible=True)  # model_hint
        )

# ---------- CSS Styling ----------
custom_css = """
.gradio-container {
    font-family: 'Arial', sans-serif;
    background-color: #f0f4f8;
}
.gradio-header {
    background: linear-gradient(135deg, #4a00e0 0%, #8e2de2 100%);
    color: white;
    padding: 20px;
    border-radius: 8px;
    text-align: center;
}
#submit-btn {
    background: linear-gradient(135deg, #4a00e0 0%, #8e2de2 100%);
    color: white;
    border-radius: 8px;
}
#clear-history {
    background: linear-gradient(135deg, #e53e3e 0%, #f56565 100%);
    color: white;
    border-radius: 8px;
}
"""

# ---------- UI Interface ----------
def create_interface():
    with gr.Blocks(css=custom_css) as demo:
        gr.Markdown("""

# Multimodal Chatbot (Image + PDF Quiz + Voice)
### Ask via image, PDF, or voice

""") with gr.Accordion("Instructions", open=False): gr.Markdown(""" - **Image Chat**: Upload an image and ask about it - **PDF Quiz**: Upload a PDF and generate MCQs - **Audio Chat**: Upload or record audio to chat - Always provide your OpenAI API key """) # State variables pdf_content = gr.State("") with gr.Row(): openai_api_key = gr.Textbox(label="OpenAI API Key", type="password", placeholder="sk-...") with gr.Row(): input_type = gr.Radio(["Image", "PDF(QUIZ)", "Audio"], label="Input Type", value="Image") # Model-specific hints that appear based on input type model_hint = gr.Markdown("", visible=False) # Input components row - all in one organized row with gr.Row(): input_text = gr.Textbox(label="Question (for images)", visible=True) image_input = gr.Image(label="Upload Image", type="pil", visible=True) pdf_input = gr.File(label="Upload PDF", visible=False) audio_input = gr.Audio(label="Upload/Record Audio", type="filepath", visible=False) quiz_slider = gr.Slider(1, 20, value=5, step=1, label="Number of Questions", visible=False) # Hidden state components for mode control pdf_quiz_mode = gr.Checkbox(visible=False, value=False) audio_mode = gr.Checkbox(visible=False, value=False) with gr.Row(): model_choice = gr.Dropdown(["o1", "o3-mini","whisper-1", "gpt-4o-mini-transcribe", "gpt-4o-transcribe",], label="Model", value="o1") submit_btn = gr.Button("Submit", elem_id="submit-btn") clear_btn = gr.Button("Clear History", elem_id="clear-history") chat_history = gr.Chatbot() # Event handlers input_type.change( update_input_type, inputs=[input_type], outputs=[input_text, image_input, pdf_input, audio_input, quiz_slider, pdf_quiz_mode, audio_mode, model_hint] ) submit_btn.click( chatbot, inputs=[input_text, image_input, pdf_input, audio_input, openai_api_key, model_choice, pdf_content, quiz_slider, pdf_quiz_mode, audio_mode, chat_history], outputs=[input_text, image_input, pdf_input, audio_input, pdf_content, chat_history] ) clear_btn.click( clear_history, outputs=[input_text, image_input, pdf_input, audio_input, pdf_content, chat_history] ) return demo # ---------- Launch ---------- if __name__ == "__main__": demo = create_interface() demo.launch()