hasanbasbunar committed on
Commit
8285937
·
verified ·
1 Parent(s): 5af68a8

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +410 -0
app.py ADDED
@@ -0,0 +1,410 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
from google import genai
from google.genai import types
from PIL import Image
import os
import io
import uuid
from dotenv import load_dotenv
load_dotenv()

# --- CONFIGURATION ---
# Key is read from the environment (populated from .env by load_dotenv above).
api_key = os.environ.get("GOOGLE_API_KEY")

# The client stays None until a key is available; users may also configure a
# key at runtime via the "API Settings" tab (see update_api_key).
client = None
if api_key:
    client = genai.Client(api_key=api_key)

# UI label -> Gemini model id. The "gemini-3" substring of the id is used
# elsewhere in this file to gate Pro-only features (resolution, grounding).
MODELS = {
    "🧠 Gemini 3 Pro Preview (Recommended)": "gemini-3-pro-image-preview",
    "⚡ Gemini 2.5 Flash (Fast)": "gemini-2.5-flash-image"
}

RATIOS = ["1:1", "2:3", "3:2", "3:4", "4:3", "4:5", "5:4", "9:16", "16:9", "21:9"]
RESOLUTIONS = ["1K", "2K", "4K"]  # Pro only

# Temporary folder to store chat images for history persistence
TEMP_CHAT_DIR = "temp_chat_images"
os.makedirs(TEMP_CHAT_DIR, exist_ok=True)
29
+
30
# --- UTILS ---

def get_client():
    """Return the configured Gemini client, or raise a UI-visible error."""
    if client:
        return client
    raise gr.Error("API Key missing. Please set GOOGLE_API_KEY environment variable.")
36
+
37
def safe_process_image(part):
    """Best-effort conversion of a response part into a PIL Image.

    Returns None when the part carries no usable image data or when any
    step of the conversion fails (the error is logged, never raised).
    """
    try:
        inline = part.inline_data
        if inline and hasattr(inline, 'data'):
            return Image.open(io.BytesIO(inline.data))
        if hasattr(part, 'as_image'):
            converted = part.as_image()
            # Some SDK versions wrap the PIL image in an `.image` attribute.
            return converted.image if hasattr(converted, 'image') else converted
        return None
    except Exception as exc:
        print(f"⚠️ Image conversion error: {exc}")
        return None
50
+
51
def process_response(response):
    """Split a Gemini response into final vs. "thought" content.

    Returns a 4-tuple: (final_images, final_text, thought_images, thought_text).
    Text parts are concatenated with trailing newlines; image parts are
    converted via safe_process_image and dropped when conversion fails.
    """
    final_imgs, thought_imgs = [], []
    final_txt, thought_txt = "", ""

    # Empty / absent responses yield the empty buckets unchanged.
    if not response or not response.parts:
        return final_imgs, final_txt, thought_imgs, thought_txt

    print(f"\n--- RECEIVED ({len(response.parts)} parts) ---")
    for idx, part in enumerate(response.parts):
        is_thought = getattr(part, 'thought', False)

        kind = "IMAGE" if part.inline_data else "TEXT"
        preview = "..." if part.inline_data else (part.text[:30] + "..." if part.text else "")
        print(f"Part {idx+1}: {kind} | Thought={is_thought} | {preview}")

        # Route each part into the thought bucket or the final bucket.
        if part.text:
            if is_thought:
                thought_txt += part.text + "\n"
            else:
                final_txt += part.text + "\n"
        if part.inline_data:
            img = safe_process_image(part)
            if img:
                (thought_imgs if is_thought else final_imgs).append(img)

    return final_imgs, final_txt, thought_imgs, thought_txt
79
+
80
# --- BACKEND FUNCTIONS ---

def update_api_key(new_key):
    """Re-initialize the module-level genai client from a user-supplied key.

    Returns a human-readable status string for the UI (never raises).
    """
    global client
    if not new_key:
        return "⚠️ Please enter a valid API Key."

    try:
        # Attempt to initialize the client with the provided key.
        client = genai.Client(api_key=new_key)
    except Exception as exc:
        return f"❌ Configuration Error: {str(exc)}"
    return "✅ API Key configured successfully! You can now use the application."
94
+
95
def generate_studio(prompt, model_ui, ratio, resolution, grounding):
    """One-shot text-to-image generation.

    Returns the 4-tuple from process_response; raises gr.Error on API failure.
    """
    cli = get_client()
    model_name = MODELS[model_ui]

    image_settings = {"aspect_ratio": ratio}
    config_kwargs = {"response_modalities": ["TEXT", "IMAGE"]}

    # Explicit resolution and Google-Search grounding are Pro-only knobs.
    if "gemini-3" in model_name:
        image_settings["image_size"] = resolution
        if grounding:
            config_kwargs["tools"] = [{"google_search": {}}]

    config_kwargs["image_config"] = types.ImageConfig(**image_settings)

    try:
        print("🚀 Sending request [T2I]...")
        response = cli.models.generate_content(
            model=model_name,
            contents=[prompt],
            config=types.GenerateContentConfig(**config_kwargs),
        )
        return process_response(response)
    except Exception as exc:
        raise gr.Error(f"API Error: {str(exc)}")
120
+
121
def generate_composition(prompt, files, model_ui, ratio, resolution):
    """Multi-image composition (I2I): combine several reference images.

    Args:
        prompt: Text instructions for the composition.
        files: List of local file paths to reference images.
        model_ui: UI label of the model (a key of MODELS).
        ratio: Output aspect ratio, e.g. "1:1".
        resolution: Output resolution ("1K"/"2K"/"4K"); applied on Pro only.

    Returns:
        (final_images, text) where text also embeds the model's reasoning.

    Raises:
        gr.Error: if no input files were given or the API call fails.
    """
    cli = get_client()
    model_name = MODELS[model_ui]

    if not files:
        raise gr.Error("No input images provided.")

    contents = [prompt]
    for path in files:
        try:
            contents.append(Image.open(path))
        except Exception as exc:
            # Best-effort: skip unreadable files but log the problem.
            # (Fix: the original bare `except: pass` silently swallowed every
            # error, including KeyboardInterrupt/SystemExit.)
            print(f"⚠️ Skipping unreadable image {path}: {exc}")

    img_conf = {"aspect_ratio": ratio}
    gen_conf = {"response_modalities": ["TEXT", "IMAGE"]}

    # Explicit output resolution is a Pro-only capability.
    if "gemini-3" in model_name:
        img_conf["image_size"] = resolution

    gen_conf["image_config"] = types.ImageConfig(**img_conf)

    try:
        print("🚀 Sending request [I2I]")
        response = cli.models.generate_content(
            model=model_name,
            contents=contents,
            config=types.GenerateContentConfig(**gen_conf)
        )
        f_imgs, f_txt, t_imgs, t_txt = process_response(response)

        # Surface the model's reasoning inline under the final text.
        full_text = f_txt
        if t_txt:
            full_text += f"\n\n--- 🧠 MODEL REASONING ---\n{t_txt}"

        return f_imgs, full_text
    except Exception as e:
        raise gr.Error(f"Error: {str(e)}")
157
+
158
# --- CHAT LOGIC ---

def init_chat_session(model_ui, grounding):
    """Create a fresh multimodal chat session for the selected model."""
    cli = get_client()
    model_name = MODELS[model_ui]

    # Google-Search grounding is only enabled on the Pro model.
    search_tools = None
    if grounding and "gemini-3" in model_name:
        search_tools = [{"google_search": {}}]

    return cli.chats.create(
        model=model_name,
        config=types.GenerateContentConfig(
            response_modalities=['TEXT', 'IMAGE'],
            tools=search_tools,
        ),
    )
176
+
177
def chat_respond(message, history, chat_state, img_input, model_ui, grounding):
    """Handle one chat turn: send the user message (plus optional image) and
    append the model's thoughts, text and generated images to the history.

    Returns (cleared_input, new_history, chat_state, final_images) so Gradio
    can empty the textbox, refresh the Chatbot, persist the session object,
    and update the side "zoom" gallery.
    """
    # Lazily create the session on the first message of a conversation.
    if chat_state is None:
        chat_state = init_chat_session(model_ui, grounding)

    # --- 1. User message prep ---
    contents = [message]
    user_display_text = message

    if img_input:
        contents.append(Image.open(img_input))
        user_display_text += "\n\n🖼️ *(Image attached)*"

    user_message_obj = {"role": "user", "content": user_display_text}

    try:
        # --- 2. API Call ---
        response = chat_state.send_message(contents)
        f_imgs, f_txt, t_imgs, t_txt = process_response(response)

        # --- 3. Bot message construction ---
        bot_messages = []

        # A. Thoughts (optional) — shown as a quoted markdown preamble.
        if t_txt or t_imgs:
            thought_md = "🧠 **Model Thought Process:**\n"
            if t_txt: thought_md += f"> {t_txt}\n"
            if t_imgs: thought_md += f"*( + {len(t_imgs)} draft image(s) not displayed)*\n"
            thought_md += "---\n"
            bot_messages.append({"role": "assistant", "content": thought_md})

        # B. Final text
        if f_txt:
            bot_messages.append({"role": "assistant", "content": f_txt})

        # C. Final images — persisted to disk so the Chatbot history can
        # keep referencing them by path across turns.
        if f_imgs:
            for i, img in enumerate(f_imgs):
                unique_filename = f"chat_{uuid.uuid4()}_{i}.png"
                file_path = os.path.join(TEMP_CHAT_DIR, unique_filename)
                img.save(file_path)

                # NOTE(review): (path, caption) tuple content is the legacy
                # "tuples" Chatbot format, but the component is created with
                # type="messages" — confirm the installed Gradio version
                # accepts this (a {"path": ...} dict may be required).
                bot_messages.append({"role": "assistant", "content": (file_path, "Generated Image")})

        # D. Empty response handling — always give the user some feedback.
        if not f_txt and not f_imgs and not t_txt:
            bot_messages.append({"role": "assistant", "content": "⚠️ *The model returned no text or image for this request.*"})

        # --- 4. History Update ---
        new_history = history + [user_message_obj] + bot_messages

        return "", new_history, chat_state, f_imgs

    except Exception as e:
        # Keep the session alive and surface the error inline in the chat.
        err_msg = f"❌ Error: {str(e)}"
        bot_err_obj = {"role": "assistant", "content": err_msg}
        return "", history + [user_message_obj, bot_err_obj], chat_state, []
234
+
235
def clear_chat(model_ui, grounding):
    """Reset the chat tab: empty history, brand-new session, cleared gallery."""
    fresh_session = init_chat_session(model_ui, grounding)
    return [], fresh_session, []
238
+
239
# --- GRADIO INTERFACE ---

css = """
.container { max-width: 1200px; margin: auto; }
h1 { text-align: center; color: #4F46E5; font-size: 2.5em; }
/* Prevent chat images from being too large */
.image-container img { max-height: 400px; width: auto; }
"""

with gr.Blocks(theme=gr.themes.Soft(), css=css, title="Nano Vision Studio") as demo:

    gr.Markdown("# Nano 🍌 Vision Studio")
    gr.Markdown("### The Ultimate Interface: 4K Generation, Grounding, Multi-Image Composition & Iterative Chat")

    # Chat Session State: holds the live genai chat object between turns.
    chat_state = gr.State(None)

    with gr.Tabs():

        # --- TAB 0 : API CONFIGURATION ---
        with gr.TabItem("🔑 API Settings"):
            gr.Markdown("### ⚙️ API Configuration")
            gr.Markdown("""
            To use **Gemini Ultimate Studio**, you must provide your own Google Gemini API Key.
            If you don't have one, you can get it from [Google AI Studio](https://aistudio.google.com/).
            """)

            with gr.Row():
                with gr.Column(scale=3):
                    api_input = gr.Textbox(
                        label="Google Gemini API Key",
                        placeholder="Paste your API key here (starts with AIza...)",
                        type="password",  # Masks the key characters
                        lines=1
                    )
                with gr.Column(scale=1):
                    api_btn = gr.Button("Save & Initialize 💾", variant="primary")

            # Status message area
            api_status = gr.Markdown()

            # Event listener
            api_btn.click(
                update_api_key,
                inputs=[api_input],
                outputs=[api_status]
            )

        # --- TAB 1 : CREATION STUDIO ---
        with gr.TabItem("🎨 Creation Studio"):
            with gr.Row():
                with gr.Column(scale=1):
                    t1_prompt = gr.Textbox(label="Prompt", lines=4, placeholder="Describe the scene in detail (lighting, style, camera angle)...")

                    with gr.Group():
                        gr.Markdown("### ⚙️ Settings")
                        t1_model = gr.Dropdown(choices=list(MODELS.keys()), value="🧠 Gemini 3 Pro Preview (Recommended)", label="Model")

                        with gr.Row():
                            t1_ratio = gr.Dropdown(RATIOS, value="16:9", label="Aspect Ratio")
                            t1_res = gr.Dropdown(RESOLUTIONS, value="2K", label="Resolution (Pro only)")

                        t1_grounding = gr.Checkbox(label="Google Search (Grounding)", info="Use real-time data (Weather, Stocks, News)")

                    t1_btn = gr.Button("Generate ✨", variant="primary", size="lg")

                with gr.Column(scale=2):
                    gr.Markdown("### 🖼️ Result")
                    t1_gallery = gr.Gallery(label="Final Images", columns=2, height="auto")
                    t1_text = gr.Markdown(label="Generated Text")

                    with gr.Accordion("🧠 Thought Process (Automatic)", open=False):
                        t1_thought_imgs = gr.Gallery(label="Visual Drafts", columns=4, height=150)
                        t1_thought_txt = gr.Textbox(label="Thought Stream", interactive=False, lines=4)

            t1_btn.click(
                generate_studio,
                inputs=[t1_prompt, t1_model, t1_ratio, t1_res, t1_grounding],
                outputs=[t1_gallery, t1_text, t1_thought_imgs, t1_thought_txt]
            )

        # --- TAB 2 : COMPOSITION ---
        with gr.TabItem("🛠️ Composition (up to 14 Images)"):
            with gr.Row():
                with gr.Column(scale=1):
                    t2_files = gr.File(label="Reference Images (Max 14)", file_count="multiple", type="filepath")
                    t2_prompt = gr.Textbox(label="Instructions", placeholder="e.g., Combine these elements, transfer the style, keep the character consistent...", lines=3)

                    with gr.Accordion("Advanced Settings", open=False):
                        t2_model = gr.Dropdown(list(MODELS.keys()), value="🧠 Gemini 3 Pro Preview (Recommended)", label="Model")
                        with gr.Row():
                            t2_ratio = gr.Dropdown(RATIOS, value="1:1", label="Aspect Ratio")
                            t2_res = gr.Dropdown(RESOLUTIONS, value="1K", label="Output Resolution")

                    t2_btn = gr.Button("Run", variant="primary")

                with gr.Column(scale=2):
                    t2_gallery = gr.Gallery(label="Result", columns=1)
                    t2_text = gr.Markdown()

            t2_btn.click(
                generate_composition,
                inputs=[t2_prompt, t2_files, t2_model, t2_ratio, t2_res],
                outputs=[t2_gallery, t2_text]
            )

        # --- TAB 3 : ITERATIVE CHAT ---
        with gr.TabItem("💬 Chat & Refinement"):
            gr.Markdown("<center>Conversational Mode: Refine your images step-by-step. Generated images appear in the history.</center>")

            with gr.Row():
                with gr.Column(scale=2):
                    # Main chatbot that will display text AND images interleaved
                    chat_history = gr.Chatbot(label="Session History", height=600, type="messages", bubble_full_width=False)

                    with gr.Row():
                        chat_input = gr.Textbox(label="Your Message", placeholder="e.g., 'Generate a portrait', then 'Add glasses'...", scale=4)
                        chat_img = gr.Image(label="Input Image (Optional)", type="filepath", height=100, scale=1, show_download_button=False, container=False)

                    with gr.Row():
                        chat_btn = gr.Button("Send", variant="primary")
                        clear_btn = gr.Button("🗑️ New Session")

                    with gr.Accordion("Chat Options", open=False):
                        c_model = gr.Dropdown(list(MODELS.keys()), value="🧠 Gemini 3 Pro Preview (Recommended)", label="Model")
                        c_grounding = gr.Checkbox(label="Grounding (Search)")

                with gr.Column(scale=1):
                    gr.Markdown("### 🔍 Last Visual")
                    # Zoom gallery to see the last image large on the side
                    chat_gallery_zoom = gr.Gallery(label="Zoom", columns=1, height="auto")

            chat_btn.click(
                chat_respond,
                inputs=[chat_input, chat_history, chat_state, chat_img, c_model, c_grounding],
                outputs=[chat_input, chat_history, chat_state, chat_gallery_zoom]
            )

            clear_btn.click(
                clear_chat,
                inputs=[c_model, c_grounding],
                outputs=[chat_history, chat_state, chat_gallery_zoom]
            )

        # --- TAB 4 : GUIDE ---
        with gr.TabItem("📚 Guide & Best Practices"):
            gr.Markdown("""
            ### 🍌 Quick Guide

            1. **Creation Studio**: Standard "one-shot" generation. The "Pro" model autonomously decides when to use its "Thinking" process (drafting) for complex prompts.
            2. **Composition**: Drag and drop up to **14 images**! Ideal for character consistency (e.g., uploading 5 photos of the same person), style transfer, or complex montages.
            3. **Iterative Chat**: **The best mode for refining an image.**
               - Start with "Generate a apple".
               - Then simply ask "Make it green".
               - The model maintains context and history.

            ### 💡 Pro Tips (from Documentation)

            * **Be Hyper-Specific**: Instead of "fantasy armor", say "ornate elven plate armor, etched with silver leaf patterns".
            * **Provide Context**: Explain the *purpose* (e.g., "Create a logo for a minimalist brand").
            * **Iterate**: Don't expect perfection instantly. Use the Chat tab to refine.
            * **Step-by-Step**: For complex scenes, break instructions down: "First, background... Then, foreground...".
            * **Semantic Negatives**: Instead of "no cars", say "an empty, deserted street".
            * **Camera Control**: Use terms like "wide-angle", "macro shot", "low-angle perspective".

            ### Key Features
            - **Grounding**: Uses Google Search to generate images based on real-time data (e.g., "Current weather in Tokyo").
            - **Resolution**: Use the "Pro" model to unlock 4K output.
            """)

if __name__ == "__main__":
    demo.launch()