# Hugging Face Space: hasanbasbunar/Nano-Vision-Studio (status: Running)
# File size: 20,375 bytes

import gradio as gr
from google import genai
from google.genai import types
from PIL import Image
import os
import io
import uuid
import threading
import time
from dotenv import load_dotenv

load_dotenv()

# --- CONFIGURATION (values follow the API documentation) ---
# Maps the label shown in the model dropdowns to the model identifier
# that is sent with each request.
MODELS = {
    "🧠 Gemini 3 Pro Preview (Recommended)": "gemini-3-pro-image-preview",
    "⚡ Gemini 2.5 Flash (Fast)": "gemini-2.5-flash-image"
}

# Exact values required by the documentation
RATIOS = ["1:1", "2:3", "3:2", "3:4", "4:3", "4:5", "5:4", "9:16", "16:9", "21:9"]
RESOLUTIONS = ["1K", "2K", "4K"] # Uppercase 'K' mandatory per docs

# Directory where images generated in the chat tab are written; it is
# created at import time and periodically purged by cleanup_old_files().
TEMP_CHAT_DIR = "temp_chat_images"
os.makedirs(TEMP_CHAT_DIR, exist_ok=True)

# --- UTILS ---

def get_client(api_key):
    """Build a Gemini API client for the supplied key.

    Args:
        api_key: The API key to authenticate with.

    Returns:
        A ``genai.Client`` configured with ``api_key``.

    Raises:
        gr.Error: If ``api_key`` is empty or missing.
    """
    if api_key:
        return genai.Client(api_key=api_key)
    raise gr.Error("API Key manquante")

def safe_process_image(part):
    """Convert the raw inline data of a response part into a PIL image.

    Args:
        part: A response part. It may expose ``inline_data`` (an object with
            a ``data`` bytes payload) and/or an ``as_image()`` helper.

    Returns:
        The decoded image, or ``None`` when the part carries no usable image
        or decoding fails (the error is printed, never raised).
    """
    try:
        # Use getattr so a part lacking these attributes is not treated as
        # a conversion error before the as_image() fallback gets a chance.
        inline = getattr(part, 'inline_data', None)
        raw = getattr(inline, 'data', None)
        # Decode only a non-empty payload; an empty/None payload would make
        # Image.open fail and skip the fallback below.
        if raw:
            return Image.open(io.BytesIO(raw))
        if hasattr(part, 'as_image'):
            img = part.as_image()
            # Unwrap when the returned object carries the image in `.image`.
            if hasattr(img, 'image'):
                return img.image
            return img
        return None
    except Exception as e:
        # Best-effort conversion: report and let the caller skip this part.
        print(f"⚠️ Image conversion error: {e}")
        return None

def process_response(response):
    """Split a model response into final output, thinking output and sources.

    Parts flagged with a truthy ``thought`` attribute feed the "thought"
    buckets; every other part feeds the "final" buckets. Each text fragment
    is followed by a newline.

    Returns:
        A 5-tuple ``(final_imgs, final_txt, thought_imgs, thought_txt,
        sources_html)``. ``sources_html`` is the rendered search entry point
        of the first candidate's grounding metadata, or ``None`` if absent.
    """
    # Nothing to parse: hand back empty buckets.
    if not response or not response.parts:
        return [], "", [], "", None

    print(f"\n--- RECEIVED ({len(response.parts)} parts) ---")

    answer_images, answer_chunks = [], []
    thinking_images, thinking_chunks = [], []

    # 1. Content parsing (images, text and thoughts)
    for part in response.parts:
        if getattr(part, 'thought', False):
            images, chunks = thinking_images, thinking_chunks
        else:
            images, chunks = answer_images, answer_chunks

        if part.text:
            chunks.append(part.text + "\n")
        if part.inline_data:
            decoded = safe_process_image(part)
            if decoded:
                images.append(decoded)

    # 2. Grounding parsing (sources)
    # The metadata lives on the candidate, not on the individual parts.
    sources_html = None
    if response.candidates:
        metadata = response.candidates[0].grounding_metadata
        if metadata:
            entry_point = metadata.search_entry_point
            if entry_point and entry_point.rendered_content:
                sources_html = entry_point.rendered_content

    return (
        answer_images,
        "".join(answer_chunks),
        thinking_images,
        "".join(thinking_chunks),
        sources_html,
    )

# --- CLEANUP WORKER ---

def cleanup_old_files():
    """Delete files older than one hour from TEMP_CHAT_DIR, every 10 minutes.

    Runs forever, so it is meant to be the target of a background thread.
    Each file is handled independently: a failure on one file is reported
    and the sweep continues with the next one. Never raises.
    """
    while True:
        try:
            cutoff = time.time() - 3600  # 1 hour
            if os.path.exists(TEMP_CHAT_DIR):
                for filename in os.listdir(TEMP_CHAT_DIR):
                    filepath = os.path.join(TEMP_CHAT_DIR, filename)
                    try:
                        if os.path.isfile(filepath) and os.path.getmtime(filepath) < cutoff:
                            os.remove(filepath)
                            print(f"🧹 Supprimé : {filename}")
                    except FileNotFoundError:
                        # The file vanished between listing and stat/remove;
                        # there is nothing left to clean up.
                        continue
                    except OSError as e:
                        # Keep the sweep going, but do not hide the failure.
                        print(f"⚠️ Cleanup skipped {filename}: {e}")
        except Exception as e:
            # Top-level boundary of the worker: log and retry on next cycle.
            print(f"⚠️ Erreur worker : {e}")
        time.sleep(600)

# --- BACKEND FUNCTIONS ---

def update_api_key(new_key):
    """Validate a user-supplied API key for the current session.

    Returns:
        A ``(status_message, key)`` pair. ``key`` is ``new_key`` when it is
        non-empty, otherwise ``None``.
    """
    if new_key:
        return "✅ Clé enregistrée pour cette session !", new_key
    return "⚠️ Clé invalide", None

def generate_studio(prompt, model_ui, ratio, resolution, grounding, user_api_key):
    """Run a standard text-to-image request.

    Args:
        prompt: Text description sent as the only content item.
        model_ui: Key of ``MODELS`` selecting the model.
        ratio: Aspect ratio passed to the image config.
        resolution: Image size; only applied to "gemini-3" models.
        grounding: Whether to enable the Google Search tool ("gemini-3" only).
        user_api_key: API key used to build the client.

    Returns:
        The 5-tuple produced by ``process_response``.

    Raises:
        gr.Error: If the key is missing or the API call fails.
    """
    client = get_client(user_api_key)
    model_id = MODELS[model_ui]
    is_gemini_3 = "gemini-3" in model_id

    # Strict configuration, as laid out in the docs
    image_kwargs = {"aspect_ratio": ratio}
    request_kwargs = {"response_modalities": ["TEXT", "IMAGE"]}

    if is_gemini_3:
        image_kwargs["image_size"] = resolution
        # Thinking mode is switched on for these models
        request_kwargs["thinking_config"] = types.ThinkingConfig(include_thoughts=True)
        if grounding:
            request_kwargs["tools"] = [{"google_search": {}}]

    request_kwargs["image_config"] = types.ImageConfig(**image_kwargs)

    try:
        print(f"🚀 Sending request [T2I]...")
        api_response = client.models.generate_content(
            model=model_id,
            contents=[prompt],
            config=types.GenerateContentConfig(**request_kwargs),
        )
        # Five values are returned (sources included)
        return process_response(api_response)
    except Exception as e:
        raise gr.Error(f"API Error: {str(e)}")

def generate_composition(prompt, files, model_ui, ratio, resolution, grounding, user_api_key):
    """Run an image-to-image composition request from reference images.

    Args:
        prompt: Instructions sent as the first content item.
        files: Paths of the reference images (the UI advertises up to 14).
        model_ui: Key of ``MODELS`` selecting the model.
        ratio: Aspect ratio passed to the image config.
        resolution: Image size; only applied to "gemini-3" models.
        grounding: Whether to enable the Google Search tool ("gemini-3" only).
        user_api_key: API key used to build the client.

    Returns:
        ``(images, text)`` where ``text`` is the final text followed by the
        grounding sources and the thought text when present.

    Raises:
        gr.Error: If the key is missing, no image is provided, none of the
            provided images can be read, or the API call fails.
    """
    cli = get_client(user_api_key)
    model_name = MODELS[model_ui]

    if not files: raise gr.Error("No input images provided.")

    contents = [prompt]
    for p in files:
        try:
            contents.append(Image.open(p))
        except Exception as e:
            # Best-effort: skip the unreadable file, but leave a trace.
            print(f"⚠️ Skipping unreadable image {p}: {e}")

    # Only the prompt is left: a composition without any image is pointless.
    if len(contents) == 1:
        raise gr.Error("None of the provided images could be read.")

    img_conf = {"aspect_ratio": ratio}
    gen_conf = {"response_modalities": ["TEXT", "IMAGE"]}

    if "gemini-3" in model_name:
        img_conf["image_size"] = resolution
        # Thinking mode is enabled here as well
        gen_conf["thinking_config"] = types.ThinkingConfig(include_thoughts=True)
        if grounding:
            gen_conf["tools"] = [{"google_search": {}}]

    gen_conf["image_config"] = types.ImageConfig(**img_conf)

    try:
        print(f"🚀 Sending request [I2I]")
        response = cli.models.generate_content(
            model=model_name,
            contents=contents,
            config=types.GenerateContentConfig(**gen_conf)
        )
        # process_response yields five values; thought images are unused here
        f_imgs, f_txt, t_imgs, t_txt, sources = process_response(response)

        # This tab has a single text output, so sources and thoughts are
        # appended to the main text.
        full_text = f_txt
        if sources: full_text += f"\n\n{sources}"
        if t_txt: full_text += f"\n\n{t_txt}"

        return f_imgs, full_text
    except Exception as e:
        # Chain the cause so the original traceback stays available in logs.
        raise gr.Error(f"Error: {str(e)}") from e

# --- CHAT LOGIC ---

def chat_respond(message, history, chat_history_data, img_input, model_ui, grounding, ratio, resolution, user_api_key):
    """Handle one chat turn; the conversation is rebuilt from state each call.

    Args:
        message: Text typed by the user.
        history: Messages currently shown in the chatbot (role/content dicts).
        chat_history_data: Gemini-side history (list of ``types.Content``)
            kept in session state, or ``None`` for a new session.
        img_input: Paths of attached images, or ``None``.
        model_ui: Key of ``MODELS`` selecting the model.
        grounding: Whether to enable the Google Search tool ("gemini-3" only).
        ratio: Aspect ratio passed to the image config.
        resolution: Image size; only applied to "gemini-3" models.
        user_api_key: API key used to build the client.

    Returns:
        A 5-tuple ``(textbox_value, file_value, ui_history, gemini_history,
        final_images)``. The first two are ``""`` and ``None`` to clear the
        inputs. On failure the error is appended to the UI history, the
        Gemini history is returned unchanged and the image list is empty.

    Raises:
        gr.Error: If the API key is missing.
    """
    # Local import: only this function needs it.
    import mimetypes

    if not user_api_key: raise gr.Error("API Key manquante")

    cli = get_client(user_api_key)
    model_name = MODELS[model_ui]

    # Normalise optional inputs so concatenations below cannot fail on None.
    message = message or ""
    history = history or []
    attachments = img_input or []

    tools = None
    thinking_conf = None

    # Image configuration
    img_conf = {"aspect_ratio": ratio}

    if "gemini-3" in model_name:
        img_conf["image_size"] = resolution
        thinking_conf = types.ThinkingConfig(include_thoughts=True)
        if grounding:
            tools = [{"google_search": {}}]

    # What the user sees in the chatbot for their own turn
    user_display_text = message
    if attachments:
        user_display_text += f"\n\n🖼️ *({len(attachments)} Images attached)*"
    user_message_obj = {"role": "user", "content": user_display_text}

    try:
        # 1. Restore the conversation from the stored history
        chat = cli.chats.create(
            model=model_name,
            config=types.GenerateContentConfig(
                response_modalities=['TEXT', 'IMAGE'],
                tools=tools,
                thinking_config=thinking_conf,
                image_config=types.ImageConfig(**img_conf)
            ),
            history=chat_history_data
        )

        # 2. Build the user content. Opening the images inside the try
        #    block lets an unreadable file surface as a chat error message.
        send_contents = [message]
        for img_path in attachments:
            send_contents.append(Image.open(img_path))

        # 3. Send to the model
        response = chat.send_message(send_contents)
        f_imgs, f_txt, t_imgs, t_txt, sources = process_response(response)

        # 4. Build the UI answer
        bot_messages = []

        if t_txt or t_imgs:
            thought_md = "🧠 **Model Thought Process:**\n"
            if t_txt: thought_md += f"> {t_txt}\n"
            if t_imgs: thought_md += f"*( + {len(t_imgs)} draft image(s) not displayed)*\n"
            thought_md += "---\n"
            bot_messages.append({"role": "assistant", "content": thought_md})

        if f_txt:
            bot_messages.append({"role": "assistant", "content": f_txt})

        # Grounding sources are shown as their own chat message
        if sources:
            bot_messages.append({"role": "assistant", "content": sources})

        if f_imgs:
            for i, img in enumerate(f_imgs):
                # Persist each image so the chatbot can reference it by path
                unique_filename = f"chat_{uuid.uuid4()}_{i}.png"
                file_path = os.path.join(TEMP_CHAT_DIR, unique_filename)
                img.save(file_path)
                img_msg = {"path": file_path, "alt_text": "Generated Image"}
                bot_messages.append({"role": "assistant", "content": img_msg})

        # Only flag an empty response when nothing at all was produced
        # (a thought-only answer already has its own message).
        if not bot_messages:
            bot_messages.append({"role": "assistant", "content": "⚠️ *Empty response.*"})

        # 5. Update the Gemini-side history
        u_parts = [types.Part.from_text(text=message)]
        for img_path in attachments:
            with open(img_path, "rb") as f:
                img_bytes = f.read()
            # Record the real type of the upload; fall back to JPEG only
            # when the extension is unknown.
            mime_type = mimetypes.guess_type(img_path)[0] or "image/jpeg"
            u_parts.append(types.Part.from_bytes(data=img_bytes, mime_type=mime_type))
        user_content_obj = types.Content(role="user", parts=u_parts)

        model_content_obj = response.candidates[0].content

        current_data = chat_history_data if chat_history_data else []
        new_gemini_history = current_data + [user_content_obj, model_content_obj]

        new_ui_history = history + [user_message_obj] + bot_messages

        return "", None, new_ui_history, new_gemini_history, f_imgs

    except Exception as e:
        # Report the failure in the conversation; stored history is untouched.
        err_msg = f"❌ Error: {str(e)}"
        bot_err_obj = {"role": "assistant", "content": err_msg}
        return "", None, history + [user_message_obj, bot_err_obj], chat_history_data, []

def clear_chat():
    """Return reset values for the chat tab.

    Returns:
        ``(ui_history, gemini_history, gallery_items)``: an empty list,
        ``None`` and an empty list.
    """
    ui_history = []
    gemini_history = None
    gallery_items = []
    return ui_history, gemini_history, gallery_items

# --- GRADIO INTERFACE ---

# Custom stylesheet for the app; it is handed to demo.launch() at startup.
css = """
.container { max-width: 1200px; margin: auto; }
h1 { text-align: center; color: #4F46E5; font-size: 2.5em; }
.image-container img { max-height: 400px; width: auto; }
"""

# UI definition: five tabs (API settings, creation studio, composition, chat, guide).
# NOTE: 'css' and 'theme' are not passed here; they are given to demo.launch() below.
with gr.Blocks(title="Nano Vision Studio") as demo:

    gr.Markdown("# Nano 🍌 Vision Studio")
    gr.Markdown("### The Ultimate Interface: 4K Generation, Grounding, Multi-Image Composition & Iterative Chat")

    # Session state: the API key (seeded from the GOOGLE_API_KEY environment
    # variable) and the Gemini-side chat history.
    user_api_key_state = gr.State(os.environ.get("GOOGLE_API_KEY", ""))
    chat_state = gr.State(None)

    with gr.Tabs():
        # --- TAB 0 : API ---
        with gr.TabItem("🔑 API Settings"):
            gr.Markdown("### ⚙️ API Configuration")
            with gr.Row():
                with gr.Column(scale=3):
                    api_input = gr.Textbox(label="Google Gemini API Key", type="password", lines=1)
                with gr.Column(scale=1):
                    api_btn = gr.Button("Save & Initialize 💾", variant="primary")
            api_status = gr.Markdown()
            api_btn.click(update_api_key, inputs=[api_input], outputs=[api_status, user_api_key_state])
        
        # --- TAB 1 : STUDIO ---
        with gr.TabItem("🎨 Creation Studio"):
            with gr.Row():
                with gr.Column(scale=1):
                    t1_prompt = gr.Textbox(label="Prompt", lines=4, placeholder="Describe the scene...")
                    with gr.Group():
                        t1_model = gr.Dropdown(choices=list(MODELS.keys()), value="🧠 Gemini 3 Pro Preview (Recommended)", label="Model")
                        with gr.Row():
                            t1_ratio = gr.Dropdown(RATIOS, value="16:9", label="Aspect Ratio")
                            t1_res = gr.Dropdown(RESOLUTIONS, value="2K", label="Resolution (Pro only)")
                        t1_grounding = gr.Checkbox(label="Google Search (Grounding)")
                    t1_btn = gr.Button("Generate ✨", variant="primary", size="lg")

                with gr.Column(scale=2):
                    t1_gallery = gr.Gallery(label="Final Images", columns=2, height="auto")
                    # Component that displays the grounding sources HTML
                    t1_sources = gr.HTML(label="Grounding Sources")
                    t1_text = gr.Markdown(label="Generated Text")
                    with gr.Accordion("🧠 Thought Process", open=False):
                        t1_thought_imgs = gr.Gallery(label="Visual Drafts", columns=4, height=150)
                        # Markdown is used for better rendering of the thought stream
                        t1_thought_txt = gr.Markdown(label="Thought Stream")

            # Output order matches the 5-tuple returned by generate_studio.
            t1_btn.click(
                generate_studio,
                inputs=[t1_prompt, t1_model, t1_ratio, t1_res, t1_grounding, user_api_key_state],
                outputs=[t1_gallery, t1_text, t1_thought_imgs, t1_thought_txt, t1_sources] # t1_sources receives the sources HTML
            )

        # --- TAB 2 : COMPOSITION ---
        with gr.TabItem("🛠️ Composition"):
            with gr.Row():
                with gr.Column(scale=1):
                    t2_files = gr.File(label="Reference Images (Max 14)", file_count="multiple", type="filepath")
                    t2_prompt = gr.Textbox(label="Instructions", lines=3)
                    with gr.Accordion("Advanced Settings", open=False):
                        t2_model = gr.Dropdown(list(MODELS.keys()), value="🧠 Gemini 3 Pro Preview (Recommended)", label="Model")
                        with gr.Row():
                            t2_ratio = gr.Dropdown(RATIOS, value="1:1", label="Aspect Ratio")
                            t2_res = gr.Dropdown(RESOLUTIONS, value="1K", label="Output Resolution")
                        t2_grounding = gr.Checkbox(label="Google Search (Grounding)") # grounding toggle
                    t2_btn = gr.Button("Run", variant="primary")

                with gr.Column(scale=2):
                    t2_gallery = gr.Gallery(label="Result", columns=1)
                    t2_text = gr.Markdown()

            t2_btn.click(
                generate_composition,
                inputs=[t2_prompt, t2_files, t2_model, t2_ratio, t2_res, t2_grounding, user_api_key_state],
                outputs=[t2_gallery, t2_text]
            )
            
        # --- TAB 3 : CHAT ---
        with gr.TabItem("💬 Chat & Refinement"):
            with gr.Row():
                with gr.Column(scale=2):
                    # NOTE: 'type="messages"' is intentionally not passed here
                    chat_history = gr.Chatbot(label="Session History", height=600)
                    with gr.Row():
                        chat_input = gr.Textbox(label="Your Message", scale=4)
                        # chat_img = gr.Image(label="Input Image", type="filepath", height=100)
                        chat_img = gr.File(label="Attach Images (Max 14)", file_count="multiple", type="filepath", height=100)
                    with gr.Row():
                        chat_btn = gr.Button("Send", variant="primary")
                        clear_btn = gr.Button("🗑️ New Session")
                    with gr.Accordion("Chat Options", open=False):
                        c_model = gr.Dropdown(list(MODELS.keys()), value="🧠 Gemini 3 Pro Preview (Recommended)", label="Model")
                        with gr.Row():
                            c_ratio = gr.Dropdown(RATIOS, value="16:9", label="Aspect Ratio") # aspect ratio option
                            c_res = gr.Dropdown(RESOLUTIONS, value="2K", label="Resolution (Pro only)") # resolution option
                        c_grounding = gr.Checkbox(label="Grounding")

                with gr.Column(scale=1):
                    chat_gallery_zoom = gr.Gallery(label="Zoom", columns=1, height="auto")

            # Output order matches the 5-tuple returned by chat_respond.
            chat_btn.click(
                chat_respond,
                inputs=[chat_input, chat_history, chat_state, chat_img, c_model, c_grounding, c_ratio, c_res, user_api_key_state],
                outputs=[chat_input, chat_img, chat_history, chat_state, chat_gallery_zoom]
            )
            # Output order matches the 3-tuple returned by clear_chat.
            clear_btn.click(
                clear_chat,
                inputs=[],
                outputs=[chat_history, chat_state, chat_gallery_zoom]
            )
            
        # --- TAB 4 : GUIDE ---
        with gr.TabItem("📚 Guide"):
            gr.Markdown("""
            # Comprehensive Guide

            Welcome to the ultimate interface for **Nano Banana Pro** (Gemini 3 Pro) and **Nano Banana** (Gemini 2.5 Flash).

            ## 🚀 Choose Your Model

            | Feature | ⚡ Gemini 2.5 Flash (Nano Banana) | 🧠 Gemini 3 Pro (Nano Banana Pro) |
            | :--- | :--- | :--- |
            | **Best For** | Speed, High Volume, Prototyping | Professional Assets, Complex Logic, Text Rendering |
            | **Resolution** | 1024x1024 (Native) | Up to **4K** (High Fidelity) |
            | **Inputs** | Text + Images | Text + up to **14 Reference Images** |
            | **Special** | Fast & Efficient | **Thinking Process**, **Search Grounding** |

            ---

            ## ✨ Advanced Capabilities Explained

            ### 1. 🧠 The "Thinking" Process (Pro Only)
            Nano Banana Pro doesn't just draw; it *thinks*. Before generating pixels, it reasons through your prompt to understand composition, lighting, and logic.
            * **In this App:** Check the **"Thought Process"** accordion in the *Creation Studio* to read the model's internal monologue and see draft visualizations (Thought Images).

            ### 2. 🌍 Search Grounding (Real-Time Data)
            The model isn't stuck in the past. It can access **Google Search** to generate images based on live data.
            * **Try this:** "Visualize the current weather forecast for Tokyo as a modern chart."
            * **In this App:** Enable the **"Grounding"** checkbox. Sources will appear below the generated image.

            ### 3. 🖼️ Advanced Composition (up to 14 Images)
            While Flash handles fewer inputs, Pro can mix up to **14 images**!
            * **Use Case:** Style transfer, maintaining character consistency, or complex collages.
            * **How:** Use the **"Composition"** tab to upload multiple reference files.

            ---

            ## 💡 Prompting Masterclass

            To get the best results, follow these professional tips:

            * **Be Hyper-Specific:** Don't just say "a cat". Say *"A photorealistic close-up of a Siamese cat, golden hour lighting, captured with an 85mm lens"*.
            * **Provide Context:** Explain the intent. *"Create a logo for a high-end minimalist skincare brand"*.
            * **Positive Framing:** Instead of "no cars", describe the scene as *"an empty, deserted street"*.
            * **Iterate with Chat:** Don't expect perfection on turn #1. Use the **Chat & Refinement** tab to say *"Make the lighting warmer"* or *"Add a wizard hat"*.

            ## ⚡ Performance Tips
            * **4K Generation:** Available only on Pro. It costs more but delivers stunning print-quality results.
            * **Aspect Ratios:** We support everything from **1:1** to **21:9** (Cinematic).
            """)

if __name__ == "__main__":
    # Start the temp-file cleanup loop in a daemon thread before serving.
    cleanup_thread = threading.Thread(target=cleanup_old_files, daemon=True)
    cleanup_thread.start()

    demo.queue(default_concurrency_limit=20)

    # 'css' and 'theme' are supplied at launch time rather than to gr.Blocks.
    launch_options = {
        "theme": gr.themes.Soft(),
        "css": css,
        "max_threads": 40,
        "show_error": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo.launch(**launch_options)