# Hugging Face Space by hasanbasbunar — "Update app.py" (commit 6205725, verified).
# (Scraped web-UI header converted to a comment so the module parses.)
import gradio as gr
from google import genai
from google.genai import types
from PIL import Image
import os
import io
import uuid
import threading
import time
from dotenv import load_dotenv
load_dotenv()
# --- CONFIGURATION (values mandated by the Gemini API docs) ---
# UI label -> API model identifier.
MODELS = {
    "🧠 Gemini 3 Pro Preview (Recommended)": "gemini-3-pro-image-preview",
    "⚡ Gemini 2.5 Flash (Fast)": "gemini-2.5-flash-image"
}
# Exact aspect-ratio strings required by the documentation.
RATIOS = ["1:1", "2:3", "3:2", "3:4", "4:3", "4:5", "5:4", "9:16", "16:9", "21:9"]
RESOLUTIONS = ["1K", "2K", "4K"]  # Uppercase 'K' mandatory per docs
# Scratch directory for chat-generated images; purged by the cleanup worker.
TEMP_CHAT_DIR = "temp_chat_images"
os.makedirs(TEMP_CHAT_DIR, exist_ok=True)
# --- UTILS ---
def get_client(api_key):
    """Build a Gemini API client from the given key, or raise a UI error if it is missing."""
    if not api_key:
        raise gr.Error("API Key manquante")
    return genai.Client(api_key=api_key)
def safe_process_image(part):
    """Convert a response part's raw inline data into a PIL Image; returns None on failure."""
    try:
        inline = part.inline_data
        if inline and hasattr(inline, 'data'):
            return Image.open(io.BytesIO(inline.data))
        if hasattr(part, 'as_image'):
            candidate = part.as_image()
            # Some SDK versions wrap the PIL image in an object with an `.image` attribute.
            return candidate.image if hasattr(candidate, 'image') else candidate
        return None
    except Exception as exc:
        print(f"⚠️ Image conversion error: {exc}")
        return None
def process_response(response):
    """Split a model response into final output, Thinking-Mode content and grounding sources.

    Returns a 5-tuple:
    (final_images, final_text, thought_images, thought_text, sources_html).
    """
    final_imgs, final_txt = [], ""
    thought_imgs, thought_txt = [], ""
    sources_html = None  # Rendered HTML widget for Google Search sources, if present.
    if not response or not response.parts:
        return final_imgs, final_txt, thought_imgs, thought_txt, sources_html
    print(f"\n--- RECEIVED ({len(response.parts)} parts) ---")
    # 1. Route every part into either the "thought" or the "final" bucket.
    for part in response.parts:
        if getattr(part, 'thought', False):
            if part.text:
                thought_txt += part.text + "\n"
            if part.inline_data:
                img = safe_process_image(part)
                if img:
                    thought_imgs.append(img)
        else:
            if part.text:
                final_txt += part.text + "\n"
            if part.inline_data:
                img = safe_process_image(part)
                if img:
                    final_imgs.append(img)
    # 2. Grounding metadata lives on the candidate, not on the parts.
    if response.candidates and response.candidates[0].grounding_metadata:
        gm = response.candidates[0].grounding_metadata
        if gm.search_entry_point and gm.search_entry_point.rendered_content:
            sources_html = gm.search_entry_point.rendered_content
    return final_imgs, final_txt, thought_imgs, thought_txt, sources_html
# --- CLEANUP WORKER ---
def cleanup_old_files():
    """Background worker: every 10 minutes, delete temp chat images older than 1 hour.

    Runs forever; intended to be started as a daemon thread.
    """
    while True:
        try:
            cutoff = time.time() - 3600  # anything older than 1 hour is stale
            if os.path.exists(TEMP_CHAT_DIR):
                for filename in os.listdir(TEMP_CHAT_DIR):
                    filepath = os.path.join(TEMP_CHAT_DIR, filename)
                    if os.path.isfile(filepath) and os.path.getmtime(filepath) < cutoff:
                        try:
                            os.remove(filepath)
                            # Bug fix: the log f-string had no placeholder, so it
                            # never reported which file was deleted.
                            print(f"🧹 Supprimé : {filepath}")
                        except OSError:
                            # Best-effort: the file may already be gone or in use.
                            pass
        except Exception as e:
            # Keep the worker alive no matter what goes wrong in a pass.
            print(f"⚠️ Erreur worker : {e}")
        time.sleep(600)
# --- BACKEND FUNCTIONS ---
def update_api_key(new_key):
    """Validate a user-supplied API key; returns (status_message, key_or_None) for gr.State."""
    if new_key:
        return "✅ Clé enregistrée pour cette session !", new_key
    return "⚠️ Clé invalide", None
def generate_studio(prompt, model_ui, ratio, resolution, grounding, user_api_key):
    """Standard text-to-image generation following types.GenerateContentConfig.

    Returns the 5-tuple from process_response:
    (final_images, final_text, thought_images, thought_text, sources_html).
    """
    client = get_client(user_api_key)
    model_name = MODELS[model_ui]
    # Build the request config exactly as the documentation prescribes.
    image_settings = {"aspect_ratio": ratio}
    config_kwargs = {"response_modalities": ["TEXT", "IMAGE"]}
    if "gemini-3" in model_name:
        # Resolution selection and Thinking Mode are Pro-only features.
        image_settings["image_size"] = resolution
        config_kwargs["thinking_config"] = types.ThinkingConfig(include_thoughts=True)
    if grounding:
        config_kwargs["tools"] = [{"google_search": {}}]
    config_kwargs["image_config"] = types.ImageConfig(**image_settings)
    try:
        print("🚀 Sending request [T2I]...")
        response = client.models.generate_content(
            model=model_name,
            contents=[prompt],
            config=types.GenerateContentConfig(**config_kwargs),
        )
        # Five return values (including grounding sources).
        return process_response(response)
    except Exception as e:
        raise gr.Error(f"API Error: {str(e)}")
def generate_composition(prompt, files, model_ui, ratio, resolution, grounding, user_api_key):
    """Image-to-image composition (up to 14 reference images per the docs).

    Returns (final_images, combined_text) where combined_text appends the
    grounding sources and the thought stream to the model's main answer.
    """
    cli = get_client(user_api_key)
    model_name = MODELS[model_ui]
    if not files:
        raise gr.Error("No input images provided.")
    contents = [prompt]
    for p in files:
        try:
            contents.append(Image.open(p))
        except OSError as e:
            # Bug fix: was a bare `except: pass`, which silently dropped
            # unreadable references (and swallowed KeyboardInterrupt too).
            # Still best-effort: skip the bad file, but say so.
            print(f"⚠️ Skipping unreadable reference image {p}: {e}")
    img_conf = {"aspect_ratio": ratio}
    gen_conf = {"response_modalities": ["TEXT", "IMAGE"]}
    if "gemini-3" in model_name:
        # Resolution selection and Thinking Mode are Pro-only features.
        img_conf["image_size"] = resolution
        gen_conf["thinking_config"] = types.ThinkingConfig(include_thoughts=True)
    if grounding:
        gen_conf["tools"] = [{"google_search": {}}]
    gen_conf["image_config"] = types.ImageConfig(**img_conf)
    try:
        print("🚀 Sending request [I2I]")
        response = cli.models.generate_content(
            model=model_name,
            contents=contents,
            config=types.GenerateContentConfig(**gen_conf)
        )
        # process_response yields 5 values; this tab folds sources and
        # thoughts into a single markdown output.
        f_imgs, f_txt, t_imgs, t_txt, sources = process_response(response)
        full_text = f_txt
        if sources:
            full_text += f"\n\n{sources}"
        if t_txt:
            full_text += f"\n\n{t_txt}"
        return f_imgs, full_text
    except Exception as e:
        raise gr.Error(f"Error: {str(e)}")
# --- CHAT LOGIC ---
def chat_respond(message, history, chat_history_data, img_input, model_ui, grounding, ratio, resolution, user_api_key):
    """Handle one turn of the 'stateless' chat using the Google GenAI types.

    The Gemini-side conversation is rebuilt from `chat_history_data` on every
    call, so the gr.State object is the single source of truth.

    Returns (cleared_textbox, cleared_file_input, new_ui_history,
    new_gemini_history, final_images_for_zoom_gallery).
    """
    if not user_api_key: raise gr.Error("API Key manquante")
    cli = get_client(user_api_key)
    model_name = MODELS[model_ui]
    tools = None
    thinking_conf = None  # Only populated for the Pro model below.
    # Image generation configuration.
    img_conf = {"aspect_ratio": ratio}
    if "gemini-3" in model_name:
        img_conf["image_size"] = resolution
        # Thinking Mode configuration (Pro only).
        thinking_conf = types.ThinkingConfig(include_thoughts=True)
    if grounding:
        tools = [{"google_search": {}}]
    # 1. Recreate the chat session from the stored history.
    chat = cli.chats.create(
        model=model_name,
        config=types.GenerateContentConfig(
            response_modalities=['TEXT', 'IMAGE'],
            tools=tools,
            thinking_config=thinking_conf,
            image_config=types.ImageConfig(**img_conf)  # Image config applies in chat too.
        ),
        history=chat_history_data
    )
    # 2. Assemble the user content: the text plus any attached images.
    send_contents = [message]
    if img_input:
        for img_path in img_input:
            send_contents.append(Image.open(img_path))
    user_display_text = message
    if img_input:
        user_display_text += f"\n\n🖼️ *({len(img_input)} Images attached)*"
    user_message_obj = {"role": "user", "content": user_display_text}
    try:
        # 3. Send to the model.
        response = chat.send_message(send_contents)
        # process_response returns 5 values (including grounding sources).
        f_imgs, f_txt, t_imgs, t_txt, sources = process_response(response)
        # 4. Build the UI-side chat messages.
        bot_messages = []
        if t_txt or t_imgs:
            thought_md = "🧠 **Model Thought Process:**\n"
            if t_txt: thought_md += f"> {t_txt}\n"
            if t_imgs: thought_md += f"*( + {len(t_imgs)} draft image(s) not displayed)*\n"
            thought_md += "---\n"
            bot_messages.append({"role": "assistant", "content": thought_md})
        if f_txt:
            bot_messages.append({"role": "assistant", "content": f_txt})
        # Show grounding sources inline in the chat stream.
        if sources:
            bot_messages.append({"role": "assistant", "content": sources})
        if f_imgs:
            for i, img in enumerate(f_imgs):
                # Unique filename so concurrent sessions never collide.
                unique_filename = f"chat_{uuid.uuid4()}_{i}.png"
                file_path = os.path.join(TEMP_CHAT_DIR, unique_filename)
                img.save(file_path)
                img_msg = {"path": file_path, "alt_text": "Generated Image"}
                bot_messages.append({"role": "assistant", "content": img_msg})
        if not f_txt and not f_imgs and not t_txt and not sources:
            bot_messages.append({"role": "assistant", "content": "⚠️ *Empty response.*"})
        # 5. Update the Gemini-side history with this turn.
        u_parts = [types.Part.from_text(text=message)]
        if img_input:
            for img_path in img_input:
                with open(img_path, "rb") as f:
                    img_bytes = f.read()
                # NOTE(review): the mime type is hard-coded to JPEG even for PNG
                # attachments — confirm the API tolerates the mismatch.
                u_parts.append(types.Part.from_bytes(data=img_bytes, mime_type="image/jpeg"))
        user_content_obj = types.Content(role="user", parts=u_parts)
        model_content_obj = response.candidates[0].content
        current_data = chat_history_data if chat_history_data else []
        new_gemini_history = current_data + [user_content_obj, model_content_obj]
        new_ui_history = history + [user_message_obj] + bot_messages
        return "", None, new_ui_history, new_gemini_history, f_imgs
    except Exception as e:
        # On failure keep the Gemini history unchanged and surface the error in chat.
        err_msg = f"❌ Error: {str(e)}"
        bot_err_obj = {"role": "assistant", "content": err_msg}
        return "", None, history + [user_message_obj, bot_err_obj], chat_history_data, []
def clear_chat():
    """Reset the chat UI history, the Gemini history state and the zoom gallery."""
    empty_ui, empty_gallery = [], []
    return empty_ui, None, empty_gallery
# --- GRADIO INTERFACE ---
# Custom stylesheet for the app.
# NOTE(review): Gradio applies CSS via gr.Blocks(css=...); Blocks.launch() does
# not accept a css argument — confirm against the installed Gradio version.
css = """
.container { max-width: 1200px; margin: auto; }
h1 { text-align: center; color: #4F46E5; font-size: 2.5em; }
.image-container img { max-height: 400px; width: auto; }
"""
# NOTE(review): the author removed css=/theme= from this constructor and passed
# them to launch() at the bottom instead; gr.Blocks() is where Gradio accepts
# them — verify against the installed Gradio version.
with gr.Blocks(title="Nano Vision Studio") as demo:
    gr.Markdown("# Nano 🍌 Vision Studio")
    gr.Markdown("### The Ultimate Interface: 4K Generation, Grounding, Multi-Image Composition & Iterative Chat")
    # Session-scoped state: the API key (seeded from env) and the Gemini chat history.
    user_api_key_state = gr.State(os.environ.get("GOOGLE_API_KEY", ""))
    chat_state = gr.State(None)
    with gr.Tabs():
        # --- TAB 0 : API ---
        with gr.TabItem("🔑 API Settings"):
            gr.Markdown("### ⚙️ API Configuration")
            with gr.Row():
                with gr.Column(scale=3):
                    api_input = gr.Textbox(label="Google Gemini API Key", type="password", lines=1)
                with gr.Column(scale=1):
                    api_btn = gr.Button("Save & Initialize 💾", variant="primary")
            api_status = gr.Markdown()
            api_btn.click(update_api_key, inputs=[api_input], outputs=[api_status, user_api_key_state])
        # --- TAB 1 : STUDIO ---
        with gr.TabItem("🎨 Creation Studio"):
            with gr.Row():
                with gr.Column(scale=1):
                    t1_prompt = gr.Textbox(label="Prompt", lines=4, placeholder="Describe the scene...")
                    with gr.Group():
                        t1_model = gr.Dropdown(choices=list(MODELS.keys()), value="🧠 Gemini 3 Pro Preview (Recommended)", label="Model")
                        with gr.Row():
                            t1_ratio = gr.Dropdown(RATIOS, value="16:9", label="Aspect Ratio")
                            t1_res = gr.Dropdown(RESOLUTIONS, value="2K", label="Resolution (Pro only)")
                        t1_grounding = gr.Checkbox(label="Google Search (Grounding)")
                    t1_btn = gr.Button("Generate ✨", variant="primary", size="lg")
                with gr.Column(scale=2):
                    t1_gallery = gr.Gallery(label="Final Images", columns=2, height="auto")
                    # HTML panel that renders the grounding-sources widget.
                    t1_sources = gr.HTML(label="Grounding Sources")
                    t1_text = gr.Markdown(label="Generated Text")
                    with gr.Accordion("🧠 Thought Process", open=False):
                        t1_thought_imgs = gr.Gallery(label="Visual Drafts", columns=4, height=150)
                        # Markdown renders the thought stream better than plain text.
                        t1_thought_txt = gr.Markdown(label="Thought Stream")
            t1_btn.click(
                generate_studio,
                inputs=[t1_prompt, t1_model, t1_ratio, t1_res, t1_grounding, user_api_key_state],
                outputs=[t1_gallery, t1_text, t1_thought_imgs, t1_thought_txt, t1_sources]  # 5 outputs incl. sources
            )
        # --- TAB 2 : COMPOSITION ---
        with gr.TabItem("🛠️ Composition"):
            with gr.Row():
                with gr.Column(scale=1):
                    t2_files = gr.File(label="Reference Images (Max 14)", file_count="multiple", type="filepath")
                    t2_prompt = gr.Textbox(label="Instructions", lines=3)
                    with gr.Accordion("Advanced Settings", open=False):
                        t2_model = gr.Dropdown(list(MODELS.keys()), value="🧠 Gemini 3 Pro Preview (Recommended)", label="Model")
                        with gr.Row():
                            t2_ratio = gr.Dropdown(RATIOS, value="1:1", label="Aspect Ratio")
                            t2_res = gr.Dropdown(RESOLUTIONS, value="1K", label="Output Resolution")
                        t2_grounding = gr.Checkbox(label="Google Search (Grounding)")
                    t2_btn = gr.Button("Run", variant="primary")
                with gr.Column(scale=2):
                    t2_gallery = gr.Gallery(label="Result", columns=1)
                    t2_text = gr.Markdown()
            t2_btn.click(
                generate_composition,
                inputs=[t2_prompt, t2_files, t2_model, t2_ratio, t2_res, t2_grounding, user_api_key_state],
                outputs=[t2_gallery, t2_text]
            )
        # --- TAB 3 : CHAT ---
        with gr.TabItem("💬 Chat & Refinement"):
            with gr.Row():
                with gr.Column(scale=2):
                    # NOTE(review): chat_respond emits {"role": ..., "content": ...}
                    # dicts, which newer Gradio only accepts when the Chatbot is
                    # created with type="messages" — confirm against the installed
                    # Gradio version.
                    chat_history = gr.Chatbot(label="Session History", height=600)
                    with gr.Row():
                        chat_input = gr.Textbox(label="Your Message", scale=4)
                        chat_img = gr.File(label="Attach Images (Max 14)", file_count="multiple", type="filepath", height=100)
                    with gr.Row():
                        chat_btn = gr.Button("Send", variant="primary")
                        clear_btn = gr.Button("🗑️ New Session")
                    with gr.Accordion("Chat Options", open=False):
                        c_model = gr.Dropdown(list(MODELS.keys()), value="🧠 Gemini 3 Pro Preview (Recommended)", label="Model")
                        with gr.Row():
                            c_ratio = gr.Dropdown(RATIOS, value="16:9", label="Aspect Ratio")
                            c_res = gr.Dropdown(RESOLUTIONS, value="2K", label="Resolution (Pro only)")
                        c_grounding = gr.Checkbox(label="Grounding")
                with gr.Column(scale=1):
                    chat_gallery_zoom = gr.Gallery(label="Zoom", columns=1, height="auto")
            chat_btn.click(
                chat_respond,
                inputs=[chat_input, chat_history, chat_state, chat_img, c_model, c_grounding, c_ratio, c_res, user_api_key_state],
                outputs=[chat_input, chat_img, chat_history, chat_state, chat_gallery_zoom]
            )
            clear_btn.click(
                clear_chat,
                inputs=[],
                outputs=[chat_history, chat_state, chat_gallery_zoom]
            )
        # --- TAB 4 : GUIDE ---
        with gr.TabItem("📚 Guide"):
            gr.Markdown("""
# Comprehensive Guide
Welcome to the ultimate interface for **Nano Banana Pro** (Gemini 3 Pro) and **Nano Banana** (Gemini 2.5 Flash).
## 🚀 Choose Your Model
| Feature | ⚡ Gemini 2.5 Flash (Nano Banana) | 🧠 Gemini 3 Pro (Nano Banana Pro) |
| :--- | :--- | :--- |
| **Best For** | Speed, High Volume, Prototyping | Professional Assets, Complex Logic, Text Rendering |
| **Resolution** | 1024x1024 (Native) | Up to **4K** (High Fidelity) |
| **Inputs** | Text + Images | Text + up to **14 Reference Images** |
| **Special** | Fast & Efficient | **Thinking Process**, **Search Grounding** |
---
## ✨ Advanced Capabilities Explained
### 1. 🧠 The "Thinking" Process (Pro Only)
Nano Banana Pro doesn't just draw; it *thinks*. Before generating pixels, it reasons through your prompt to understand composition, lighting, and logic.
* **In this App:** Check the **"Thought Process"** accordion in the *Creation Studio* to read the model's internal monologue and see draft visualizations (Thought Images).
### 2. 🌍 Search Grounding (Real-Time Data)
The model isn't stuck in the past. It can access **Google Search** to generate images based on live data.
* **Try this:** "Visualize the current weather forecast for Tokyo as a modern chart."
* **In this App:** Enable the **"Grounding"** checkbox. Sources will appear below the generated image.
### 3. 🖼️ Advanced Composition (up to 14 Images)
While Flash handles fewer inputs, Pro can mix up to **14 images**!
* **Use Case:** Style transfer, maintaining character consistency, or complex collages.
* **How:** Use the **"Composition"** tab to upload multiple reference files.
---
## 💡 Prompting Masterclass
To get the best results, follow these professional tips:
* **Be Hyper-Specific:** Don't just say "a cat". Say *"A photorealistic close-up of a Siamese cat, golden hour lighting, captured with an 85mm lens"*.
* **Provide Context:** Explain the intent. *"Create a logo for a high-end minimalist skincare brand"*.
* **Positive Framing:** Instead of "no cars", describe the scene as *"an empty, deserted street"*.
* **Iterate with Chat:** Don't expect perfection on turn #1. Use the **Chat & Refinement** tab to say *"Make the lighting warmer"* or *"Add a wizard hat"*.
## ⚡ Performance Tips
* **4K Generation:** Available only on Pro. It costs more but delivers stunning print-quality results.
* **Aspect Ratios:** We support everything from **1:1** to **21:9** (Cinematic).
""")
if __name__ == "__main__":
    # Purge stale chat images in the background for the app's lifetime.
    threading.Thread(target=cleanup_old_files, daemon=True).start()
    demo.queue(default_concurrency_limit=20)
    # Bug fix: `theme` and `css` are gr.Blocks() constructor arguments, not
    # Blocks.launch() arguments — passing them here raises TypeError at
    # startup. They were removed from launch(); to restore the styling, pass
    # them where `demo` is constructed.
    demo.launch(
        max_threads=40,
        show_error=True,
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )