# Hugging Face Space by hasanbasbunar — "Update app.py" (commit 6205725, verified).
# (Scraped web-UI header converted to a comment so the module parses.)
import gradio as gr
from google import genai
from google.genai import types
from PIL import Image
import os
import io
import uuid
import threading
import time
from dotenv import load_dotenv
load_dotenv()
# --- CONFIGURATION (values mandated by the Gemini API docs) ---
# UI label -> API model identifier.
MODELS = {
    "🧠 Gemini 3 Pro Preview (Recommended)": "gemini-3-pro-image-preview",
    "⚡ Gemini 2.5 Flash (Fast)": "gemini-2.5-flash-image"
}
# Exact aspect-ratio strings required by the documentation.
RATIOS = ["1:1", "2:3", "3:2", "3:4", "4:3", "4:5", "5:4", "9:16", "16:9", "21:9"]
RESOLUTIONS = ["1K", "2K", "4K"]  # Uppercase 'K' mandatory per docs
# Scratch directory for chat-generated images; purged by the cleanup worker.
TEMP_CHAT_DIR = "temp_chat_images"
os.makedirs(TEMP_CHAT_DIR, exist_ok=True)
# --- UTILS ---
def get_client(api_key):
    """Build a Gemini API client from the given key, or raise a UI error if it is missing."""
    if not api_key:
        raise gr.Error("API Key manquante")
    return genai.Client(api_key=api_key)
def safe_process_image(part):
    """Convert a response part's raw inline data into a PIL Image; returns None on failure."""
    try:
        inline = part.inline_data
        if inline and hasattr(inline, 'data'):
            return Image.open(io.BytesIO(inline.data))
        if hasattr(part, 'as_image'):
            candidate = part.as_image()
            # Some SDK versions wrap the PIL image in an object with an `.image` attribute.
            return candidate.image if hasattr(candidate, 'image') else candidate
        return None
    except Exception as exc:
        print(f"⚠️ Image conversion error: {exc}")
        return None
def process_response(response):
    """Split a model response into final output, Thinking-Mode content and grounding sources.

    Returns a 5-tuple:
    (final_images, final_text, thought_images, thought_text, sources_html).
    """
    final_imgs, final_txt = [], ""
    thought_imgs, thought_txt = [], ""
    sources_html = None  # Rendered HTML widget for Google Search sources, if present.
    if not response or not response.parts:
        return final_imgs, final_txt, thought_imgs, thought_txt, sources_html
    print(f"\n--- RECEIVED ({len(response.parts)} parts) ---")
    # 1. Route every part into either the "thought" or the "final" bucket.
    for part in response.parts:
        if getattr(part, 'thought', False):
            if part.text:
                thought_txt += part.text + "\n"
            if part.inline_data:
                img = safe_process_image(part)
                if img:
                    thought_imgs.append(img)
        else:
            if part.text:
                final_txt += part.text + "\n"
            if part.inline_data:
                img = safe_process_image(part)
                if img:
                    final_imgs.append(img)
    # 2. Grounding metadata lives on the candidate, not on the parts.
    if response.candidates and response.candidates[0].grounding_metadata:
        gm = response.candidates[0].grounding_metadata
        if gm.search_entry_point and gm.search_entry_point.rendered_content:
            sources_html = gm.search_entry_point.rendered_content
    return final_imgs, final_txt, thought_imgs, thought_txt, sources_html
# --- CLEANUP WORKER ---
def cleanup_old_files():
    """Background worker: every 10 minutes, delete temp chat images older than 1 hour.

    Runs forever; intended to be started as a daemon thread.
    """
    while True:
        try:
            cutoff = time.time() - 3600  # anything older than 1 hour is stale
            if os.path.exists(TEMP_CHAT_DIR):
                for filename in os.listdir(TEMP_CHAT_DIR):
                    filepath = os.path.join(TEMP_CHAT_DIR, filename)
                    if os.path.isfile(filepath) and os.path.getmtime(filepath) < cutoff:
                        try:
                            os.remove(filepath)
                            # Bug fix: the log f-string had no placeholder, so it
                            # never reported which file was deleted.
                            print(f"🧹 Supprimé : {filepath}")
                        except OSError:
                            # Best-effort: the file may already be gone or in use.
                            pass
        except Exception as e:
            # Keep the worker alive no matter what goes wrong in a pass.
            print(f"⚠️ Erreur worker : {e}")
        time.sleep(600)
# --- BACKEND FUNCTIONS ---
def update_api_key(new_key):
    """Validate a user-supplied API key; returns (status_message, key_or_None) for gr.State."""
    if new_key:
        return "✅ Clé enregistrée pour cette session !", new_key
    return "⚠️ Clé invalide", None
def generate_studio(prompt, model_ui, ratio, resolution, grounding, user_api_key):
    """Standard text-to-image generation following types.GenerateContentConfig.

    Returns the 5-tuple from process_response:
    (final_images, final_text, thought_images, thought_text, sources_html).
    """
    client = get_client(user_api_key)
    model_name = MODELS[model_ui]
    # Build the request config exactly as the documentation prescribes.
    image_settings = {"aspect_ratio": ratio}
    config_kwargs = {"response_modalities": ["TEXT", "IMAGE"]}
    if "gemini-3" in model_name:
        # Resolution selection and Thinking Mode are Pro-only features.
        image_settings["image_size"] = resolution
        config_kwargs["thinking_config"] = types.ThinkingConfig(include_thoughts=True)
    if grounding:
        config_kwargs["tools"] = [{"google_search": {}}]
    config_kwargs["image_config"] = types.ImageConfig(**image_settings)
    try:
        print("🚀 Sending request [T2I]...")
        response = client.models.generate_content(
            model=model_name,
            contents=[prompt],
            config=types.GenerateContentConfig(**config_kwargs),
        )
        # Five return values (including grounding sources).
        return process_response(response)
    except Exception as e:
        raise gr.Error(f"API Error: {str(e)}")
def generate_composition(prompt, files, model_ui, ratio, resolution, grounding, user_api_key):
    """Image-to-image composition (up to 14 reference images per the docs).

    Returns (final_images, combined_text) where combined_text appends the
    grounding sources and the thought stream to the model's main answer.
    """
    cli = get_client(user_api_key)
    model_name = MODELS[model_ui]
    if not files:
        raise gr.Error("No input images provided.")
    contents = [prompt]
    for p in files:
        try:
            contents.append(Image.open(p))
        except OSError as e:
            # Bug fix: was a bare `except: pass`, which silently dropped
            # unreadable references (and swallowed KeyboardInterrupt too).
            # Still best-effort: skip the bad file, but say so.
            print(f"⚠️ Skipping unreadable reference image {p}: {e}")
    img_conf = {"aspect_ratio": ratio}
    gen_conf = {"response_modalities": ["TEXT", "IMAGE"]}
    if "gemini-3" in model_name:
        # Resolution selection and Thinking Mode are Pro-only features.
        img_conf["image_size"] = resolution
        gen_conf["thinking_config"] = types.ThinkingConfig(include_thoughts=True)
    if grounding:
        gen_conf["tools"] = [{"google_search": {}}]
    gen_conf["image_config"] = types.ImageConfig(**img_conf)
    try:
        print("🚀 Sending request [I2I]")
        response = cli.models.generate_content(
            model=model_name,
            contents=contents,
            config=types.GenerateContentConfig(**gen_conf)
        )
        # process_response yields 5 values; this tab folds sources and
        # thoughts into a single markdown output.
        f_imgs, f_txt, t_imgs, t_txt, sources = process_response(response)
        full_text = f_txt
        if sources:
            full_text += f"\n\n{sources}"
        if t_txt:
            full_text += f"\n\n{t_txt}"
        return f_imgs, full_text
    except Exception as e:
        raise gr.Error(f"Error: {str(e)}")
# --- CHAT LOGIC ---
def chat_respond(message, history, chat_history_data, img_input, model_ui, grounding, ratio, resolution, user_api_key):
    """Handle one turn of the 'stateless' chat using the Google GenAI types.

    The Gemini-side conversation is rebuilt from `chat_history_data` on every
    call, so the gr.State object is the single source of truth.

    Returns (cleared_textbox, cleared_file_input, new_ui_history,
    new_gemini_history, final_images_for_zoom_gallery).
    """
    if not user_api_key: raise gr.Error("API Key manquante")
    cli = get_client(user_api_key)
    model_name = MODELS[model_ui]
    tools = None
    thinking_conf = None  # Only populated for the Pro model below.
    # Image generation configuration.
    img_conf = {"aspect_ratio": ratio}
    if "gemini-3" in model_name:
        img_conf["image_size"] = resolution
        # Thinking Mode configuration (Pro only).
        thinking_conf = types.ThinkingConfig(include_thoughts=True)
    if grounding:
        tools = [{"google_search": {}}]
    # 1. Recreate the chat session from the stored history.
    chat = cli.chats.create(
        model=model_name,
        config=types.GenerateContentConfig(
            response_modalities=['TEXT', 'IMAGE'],
            tools=tools,
            thinking_config=thinking_conf,
            image_config=types.ImageConfig(**img_conf)  # Image config applies in chat too.
        ),
        history=chat_history_data
    )
    # 2. Assemble the user content: the text plus any attached images.
    send_contents = [message]
    if img_input:
        for img_path in img_input:
            send_contents.append(Image.open(img_path))
    user_display_text = message
    if img_input:
        user_display_text += f"\n\n🖼️ *({len(img_input)} Images attached)*"
    user_message_obj = {"role": "user", "content": user_display_text}
    try:
        # 3. Send to the model.
        response = chat.send_message(send_contents)
        # process_response returns 5 values (including grounding sources).
        f_imgs, f_txt, t_imgs, t_txt, sources = process_response(response)
        # 4. Build the UI-side chat messages.
        bot_messages = []
        if t_txt or t_imgs:
            thought_md = "🧠 **Model Thought Process:**\n"
            if t_txt: thought_md += f"> {t_txt}\n"
            if t_imgs: thought_md += f"*( + {len(t_imgs)} draft image(s) not displayed)*\n"
            thought_md += "---\n"
            bot_messages.append({"role": "assistant", "content": thought_md})
        if f_txt:
            bot_messages.append({"role": "assistant", "content": f_txt})
        # Show grounding sources inline in the chat stream.
        if sources:
            bot_messages.append({"role": "assistant", "content": sources})
        if f_imgs:
            for i, img in enumerate(f_imgs):
                # Unique filename so concurrent sessions never collide.
                unique_filename = f"chat_{uuid.uuid4()}_{i}.png"
                file_path = os.path.join(TEMP_CHAT_DIR, unique_filename)
                img.save(file_path)
                img_msg = {"path": file_path, "alt_text": "Generated Image"}
                bot_messages.append({"role": "assistant", "content": img_msg})
        if not f_txt and not f_imgs and not t_txt and not sources:
            bot_messages.append({"role": "assistant", "content": "⚠️ *Empty response.*"})
        # 5. Update the Gemini-side history with this turn.
        u_parts = [types.Part.from_text(text=message)]
        if img_input:
            for img_path in img_input:
                with open(img_path, "rb") as f:
                    img_bytes = f.read()
                # NOTE(review): the mime type is hard-coded to JPEG even for PNG
                # attachments — confirm the API tolerates the mismatch.
                u_parts.append(types.Part.from_bytes(data=img_bytes, mime_type="image/jpeg"))
        user_content_obj = types.Content(role="user", parts=u_parts)
        model_content_obj = response.candidates[0].content
        current_data = chat_history_data if chat_history_data else []
        new_gemini_history = current_data + [user_content_obj, model_content_obj]
        new_ui_history = history + [user_message_obj] + bot_messages
        return "", None, new_ui_history, new_gemini_history, f_imgs
    except Exception as e:
        # On failure keep the Gemini history unchanged and surface the error in chat.
        err_msg = f"❌ Error: {str(e)}"
        bot_err_obj = {"role": "assistant", "content": err_msg}
        return "", None, history + [user_message_obj, bot_err_obj], chat_history_data, []
def clear_chat():
    """Reset the chat UI history, the Gemini history state and the zoom gallery."""
    empty_ui, empty_gallery = [], []
    return empty_ui, None, empty_gallery
# --- GRADIO INTERFACE ---
# Custom stylesheet for the app.
# NOTE(review): Gradio applies CSS via gr.Blocks(css=...); Blocks.launch() does
# not accept a css argument — confirm against the installed Gradio version.
css = """
.container { max-width: 1200px; margin: auto; }
h1 { text-align: center; color: #4F46E5; font-size: 2.5em; }
.image-container img { max-height: 400px; width: auto; }
"""
# NOTE(review): the author removed css=/theme= from this constructor and passed
# them to launch() at the bottom instead; gr.Blocks() is where Gradio accepts
# them — verify against the installed Gradio version.
with gr.Blocks(title="Nano Vision Studio") as demo:
    gr.Markdown("# Nano 🍌 Vision Studio")
    gr.Markdown("### The Ultimate Interface: 4K Generation, Grounding, Multi-Image Composition & Iterative Chat")
    # Session-scoped state: the API key (seeded from env) and the Gemini chat history.
    user_api_key_state = gr.State(os.environ.get("GOOGLE_API_KEY", ""))
    chat_state = gr.State(None)
    with gr.Tabs():
        # --- TAB 0 : API ---
        with gr.TabItem("🔑 API Settings"):
            gr.Markdown("### ⚙️ API Configuration")
            with gr.Row():
                with gr.Column(scale=3):
                    api_input = gr.Textbox(label="Google Gemini API Key", type="password", lines=1)
                with gr.Column(scale=1):
                    api_btn = gr.Button("Save & Initialize 💾", variant="primary")
            api_status = gr.Markdown()
            api_btn.click(update_api_key, inputs=[api_input], outputs=[api_status, user_api_key_state])
        # --- TAB 1 : STUDIO ---
        with gr.TabItem("🎨 Creation Studio"):
            with gr.Row():
                with gr.Column(scale=1):
                    t1_prompt = gr.Textbox(label="Prompt", lines=4, placeholder="Describe the scene...")
                    with gr.Group():
                        t1_model = gr.Dropdown(choices=list(MODELS.keys()), value="🧠 Gemini 3 Pro Preview (Recommended)", label="Model")
                        with gr.Row():
                            t1_ratio = gr.Dropdown(RATIOS, value="16:9", label="Aspect Ratio")
                            t1_res = gr.Dropdown(RESOLUTIONS, value="2K", label="Resolution (Pro only)")
                        t1_grounding = gr.Checkbox(label="Google Search (Grounding)")
                    t1_btn = gr.Button("Generate ✨", variant="primary", size="lg")
                with gr.Column(scale=2):
                    t1_gallery = gr.Gallery(label="Final Images", columns=2, height="auto")
                    # HTML panel that renders the grounding-sources widget.
                    t1_sources = gr.HTML(label="Grounding Sources")
                    t1_text = gr.Markdown(label="Generated Text")
                    with gr.Accordion("🧠 Thought Process", open=False):
                        t1_thought_imgs = gr.Gallery(label="Visual Drafts", columns=4, height=150)
                        # Markdown renders the thought stream better than plain text.
                        t1_thought_txt = gr.Markdown(label="Thought Stream")
            t1_btn.click(
                generate_studio,
                inputs=[t1_prompt, t1_model, t1_ratio, t1_res, t1_grounding, user_api_key_state],
                outputs=[t1_gallery, t1_text, t1_thought_imgs, t1_thought_txt, t1_sources]  # 5 outputs incl. sources
            )
        # --- TAB 2 : COMPOSITION ---
        with gr.TabItem("🛠️ Composition"):
            with gr.Row():
                with gr.Column(scale=1):
                    t2_files = gr.File(label="Reference Images (Max 14)", file_count="multiple", type="filepath")
                    t2_prompt = gr.Textbox(label="Instructions", lines=3)
                    with gr.Accordion("Advanced Settings", open=False):
                        t2_model = gr.Dropdown(list(MODELS.keys()), value="🧠 Gemini 3 Pro Preview (Recommended)", label="Model")
                        with gr.Row():
                            t2_ratio = gr.Dropdown(RATIOS, value="1:1", label="Aspect Ratio")
                            t2_res = gr.Dropdown(RESOLUTIONS, value="1K", label="Output Resolution")
                        t2_grounding = gr.Checkbox(label="Google Search (Grounding)")
                    t2_btn = gr.Button("Run", variant="primary")
                with gr.Column(scale=2):
                    t2_gallery = gr.Gallery(label="Result", columns=1)
                    t2_text = gr.Markdown()
            t2_btn.click(
                generate_composition,
                inputs=[t2_prompt, t2_files, t2_model, t2_ratio, t2_res, t2_grounding, user_api_key_state],
                outputs=[t2_gallery, t2_text]
            )
        # --- TAB 3 : CHAT ---
        with gr.TabItem("💬 Chat & Refinement"):
            with gr.Row():
                with gr.Column(scale=2):
                    # NOTE(review): chat_respond emits {"role": ..., "content": ...}
                    # dicts, which newer Gradio only accepts when the Chatbot is
                    # created with type="messages" — confirm against the installed
                    # Gradio version.
                    chat_history = gr.Chatbot(label="Session History", height=600)
                    with gr.Row():
                        chat_input = gr.Textbox(label="Your Message", scale=4)
                        chat_img = gr.File(label="Attach Images (Max 14)", file_count="multiple", type="filepath", height=100)
                    with gr.Row():
                        chat_btn = gr.Button("Send", variant="primary")
                        clear_btn = gr.Button("🗑️ New Session")
                    with gr.Accordion("Chat Options", open=False):
                        c_model = gr.Dropdown(list(MODELS.keys()), value="🧠 Gemini 3 Pro Preview (Recommended)", label="Model")
                        with gr.Row():
                            c_ratio = gr.Dropdown(RATIOS, value="16:9", label="Aspect Ratio")
                            c_res = gr.Dropdown(RESOLUTIONS, value="2K", label="Resolution (Pro only)")
                        c_grounding = gr.Checkbox(label="Grounding")
                with gr.Column(scale=1):
                    chat_gallery_zoom = gr.Gallery(label="Zoom", columns=1, height="auto")
            chat_btn.click(
                chat_respond,
                inputs=[chat_input, chat_history, chat_state, chat_img, c_model, c_grounding, c_ratio, c_res, user_api_key_state],
                outputs=[chat_input, chat_img, chat_history, chat_state, chat_gallery_zoom]
            )
            clear_btn.click(
                clear_chat,
                inputs=[],
                outputs=[chat_history, chat_state, chat_gallery_zoom]
            )
        # --- TAB 4 : GUIDE ---
        with gr.TabItem("📚 Guide"):
            gr.Markdown("""
# Comprehensive Guide
Welcome to the ultimate interface for **Nano Banana Pro** (Gemini 3 Pro) and **Nano Banana** (Gemini 2.5 Flash).
## 🚀 Choose Your Model
| Feature | ⚡ Gemini 2.5 Flash (Nano Banana) | 🧠 Gemini 3 Pro (Nano Banana Pro) |
| :--- | :--- | :--- |
| **Best For** | Speed, High Volume, Prototyping | Professional Assets, Complex Logic, Text Rendering |
| **Resolution** | 1024x1024 (Native) | Up to **4K** (High Fidelity) |
| **Inputs** | Text + Images | Text + up to **14 Reference Images** |
| **Special** | Fast & Efficient | **Thinking Process**, **Search Grounding** |
---
## ✨ Advanced Capabilities Explained
### 1. 🧠 The "Thinking" Process (Pro Only)
Nano Banana Pro doesn't just draw; it *thinks*. Before generating pixels, it reasons through your prompt to understand composition, lighting, and logic.
* **In this App:** Check the **"Thought Process"** accordion in the *Creation Studio* to read the model's internal monologue and see draft visualizations (Thought Images).
### 2. 🌍 Search Grounding (Real-Time Data)
The model isn't stuck in the past. It can access **Google Search** to generate images based on live data.
* **Try this:** "Visualize the current weather forecast for Tokyo as a modern chart."
* **In this App:** Enable the **"Grounding"** checkbox. Sources will appear below the generated image.
### 3. 🖼️ Advanced Composition (up to 14 Images)
While Flash handles fewer inputs, Pro can mix up to **14 images**!
* **Use Case:** Style transfer, maintaining character consistency, or complex collages.
* **How:** Use the **"Composition"** tab to upload multiple reference files.
---
## 💡 Prompting Masterclass
To get the best results, follow these professional tips:
* **Be Hyper-Specific:** Don't just say "a cat". Say *"A photorealistic close-up of a Siamese cat, golden hour lighting, captured with an 85mm lens"*.
* **Provide Context:** Explain the intent. *"Create a logo for a high-end minimalist skincare brand"*.
* **Positive Framing:** Instead of "no cars", describe the scene as *"an empty, deserted street"*.
* **Iterate with Chat:** Don't expect perfection on turn #1. Use the **Chat & Refinement** tab to say *"Make the lighting warmer"* or *"Add a wizard hat"*.
## ⚡ Performance Tips
* **4K Generation:** Available only on Pro. It costs more but delivers stunning print-quality results.
* **Aspect Ratios:** We support everything from **1:1** to **21:9** (Cinematic).
""")
if __name__ == "__main__":
    # Purge stale chat images in the background for the app's lifetime.
    threading.Thread(target=cleanup_old_files, daemon=True).start()
    demo.queue(default_concurrency_limit=20)
    # Bug fix: `theme` and `css` are gr.Blocks() constructor arguments, not
    # Blocks.launch() arguments — passing them here raises TypeError at
    # startup. They were removed from launch(); to restore the styling, pass
    # them where `demo` is constructed.
    demo.launch(
        max_threads=40,
        show_error=True,
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )