hasanbasbunar committed on
Commit
8285937
·
verified ·
1 Parent(s): 5af68a8

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +410 -0
app.py ADDED
@@ -0,0 +1,410 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
from google import genai
from google.genai import types
from PIL import Image
import os
import io
import uuid
from dotenv import load_dotenv
load_dotenv()

# --- CONFIGURATION ---
# Key is read from the environment (populated from .env by load_dotenv above).
api_key = os.environ.get("GOOGLE_API_KEY")

# The client stays None until a key is available; users may also configure a
# key at runtime via the "API Settings" tab (see update_api_key).
client = None
if api_key:
    client = genai.Client(api_key=api_key)

# UI label -> Gemini model id. The "gemini-3" substring of the id is used
# elsewhere in this file to gate Pro-only features (resolution, grounding).
MODELS = {
    "🧠 Gemini 3 Pro Preview (Recommended)": "gemini-3-pro-image-preview",
    "⚡ Gemini 2.5 Flash (Fast)": "gemini-2.5-flash-image"
}

RATIOS = ["1:1", "2:3", "3:2", "3:4", "4:3", "4:5", "5:4", "9:16", "16:9", "21:9"]
RESOLUTIONS = ["1K", "2K", "4K"]  # Pro only

# Temporary folder to store chat images for history persistence
TEMP_CHAT_DIR = "temp_chat_images"
os.makedirs(TEMP_CHAT_DIR, exist_ok=True)
29
+
30
# --- UTILS ---

def get_client():
    """Return the configured Gemini client, or raise a UI-visible error."""
    if client:
        return client
    raise gr.Error("API Key missing. Please set GOOGLE_API_KEY environment variable.")
36
+
37
def safe_process_image(part):
    """Best-effort conversion of a response part into a PIL Image.

    Returns None when the part carries no usable image data or when any
    step of the conversion fails (the error is logged, never raised).
    """
    try:
        inline = part.inline_data
        if inline and hasattr(inline, 'data'):
            return Image.open(io.BytesIO(inline.data))
        if hasattr(part, 'as_image'):
            converted = part.as_image()
            # Some SDK versions wrap the PIL image in an `.image` attribute.
            return converted.image if hasattr(converted, 'image') else converted
        return None
    except Exception as exc:
        print(f"⚠️ Image conversion error: {exc}")
        return None
50
+
51
def process_response(response):
    """Split a Gemini response into final vs. "thought" content.

    Returns a 4-tuple: (final_images, final_text, thought_images, thought_text).
    Text parts are concatenated with trailing newlines; image parts are
    converted via safe_process_image and dropped when conversion fails.
    """
    final_imgs, thought_imgs = [], []
    final_txt, thought_txt = "", ""

    # Empty / absent responses yield the empty buckets unchanged.
    if not response or not response.parts:
        return final_imgs, final_txt, thought_imgs, thought_txt

    print(f"\n--- RECEIVED ({len(response.parts)} parts) ---")
    for idx, part in enumerate(response.parts):
        is_thought = getattr(part, 'thought', False)

        kind = "IMAGE" if part.inline_data else "TEXT"
        preview = "..." if part.inline_data else (part.text[:30] + "..." if part.text else "")
        print(f"Part {idx+1}: {kind} | Thought={is_thought} | {preview}")

        # Route each part into the thought bucket or the final bucket.
        if part.text:
            if is_thought:
                thought_txt += part.text + "\n"
            else:
                final_txt += part.text + "\n"
        if part.inline_data:
            img = safe_process_image(part)
            if img:
                (thought_imgs if is_thought else final_imgs).append(img)

    return final_imgs, final_txt, thought_imgs, thought_txt
79
+
80
# --- BACKEND FUNCTIONS ---

def update_api_key(new_key):
    """Re-initialize the module-level genai client from a user-supplied key.

    Returns a human-readable status string for the UI (never raises).
    """
    global client
    if not new_key:
        return "⚠️ Please enter a valid API Key."

    try:
        # Attempt to initialize the client with the provided key.
        client = genai.Client(api_key=new_key)
    except Exception as exc:
        return f"❌ Configuration Error: {str(exc)}"
    return "✅ API Key configured successfully! You can now use the application."
94
+
95
def generate_studio(prompt, model_ui, ratio, resolution, grounding):
    """One-shot text-to-image generation.

    Returns the 4-tuple from process_response; raises gr.Error on API failure.
    """
    cli = get_client()
    model_name = MODELS[model_ui]

    image_settings = {"aspect_ratio": ratio}
    config_kwargs = {"response_modalities": ["TEXT", "IMAGE"]}

    # Explicit resolution and Google-Search grounding are Pro-only knobs.
    if "gemini-3" in model_name:
        image_settings["image_size"] = resolution
        if grounding:
            config_kwargs["tools"] = [{"google_search": {}}]

    config_kwargs["image_config"] = types.ImageConfig(**image_settings)

    try:
        print("🚀 Sending request [T2I]...")
        response = cli.models.generate_content(
            model=model_name,
            contents=[prompt],
            config=types.GenerateContentConfig(**config_kwargs),
        )
        return process_response(response)
    except Exception as exc:
        raise gr.Error(f"API Error: {str(exc)}")
120
+
121
def generate_composition(prompt, files, model_ui, ratio, resolution):
    """Multi-image composition (I2I): combine several reference images.

    Args:
        prompt: Text instructions for the composition.
        files: List of local file paths to reference images.
        model_ui: UI label of the model (a key of MODELS).
        ratio: Output aspect ratio, e.g. "1:1".
        resolution: Output resolution ("1K"/"2K"/"4K"); applied on Pro only.

    Returns:
        (final_images, text) where text also embeds the model's reasoning.

    Raises:
        gr.Error: if no input files were given or the API call fails.
    """
    cli = get_client()
    model_name = MODELS[model_ui]

    if not files:
        raise gr.Error("No input images provided.")

    contents = [prompt]
    for path in files:
        try:
            contents.append(Image.open(path))
        except Exception as exc:
            # Best-effort: skip unreadable files but log the problem.
            # (Fix: the original bare `except: pass` silently swallowed every
            # error, including KeyboardInterrupt/SystemExit.)
            print(f"⚠️ Skipping unreadable image {path}: {exc}")

    img_conf = {"aspect_ratio": ratio}
    gen_conf = {"response_modalities": ["TEXT", "IMAGE"]}

    # Explicit output resolution is a Pro-only capability.
    if "gemini-3" in model_name:
        img_conf["image_size"] = resolution

    gen_conf["image_config"] = types.ImageConfig(**img_conf)

    try:
        print("🚀 Sending request [I2I]")
        response = cli.models.generate_content(
            model=model_name,
            contents=contents,
            config=types.GenerateContentConfig(**gen_conf)
        )
        f_imgs, f_txt, t_imgs, t_txt = process_response(response)

        # Surface the model's reasoning inline under the final text.
        full_text = f_txt
        if t_txt:
            full_text += f"\n\n--- 🧠 MODEL REASONING ---\n{t_txt}"

        return f_imgs, full_text
    except Exception as e:
        raise gr.Error(f"Error: {str(e)}")
157
+
158
# --- CHAT LOGIC ---

def init_chat_session(model_ui, grounding):
    """Create a fresh multimodal chat session for the selected model."""
    cli = get_client()
    model_name = MODELS[model_ui]

    # Google-Search grounding is only enabled on the Pro model.
    search_tools = None
    if grounding and "gemini-3" in model_name:
        search_tools = [{"google_search": {}}]

    return cli.chats.create(
        model=model_name,
        config=types.GenerateContentConfig(
            response_modalities=['TEXT', 'IMAGE'],
            tools=search_tools,
        ),
    )
176
+
177
def chat_respond(message, history, chat_state, img_input, model_ui, grounding):
    """Handle one chat turn: send the user message (plus optional image) and
    append the model's thoughts, text and generated images to the history.

    Returns (cleared_input, new_history, chat_state, final_images) so Gradio
    can empty the textbox, refresh the Chatbot, persist the session object,
    and update the side "zoom" gallery.
    """
    # Lazily create the session on the first message of a conversation.
    if chat_state is None:
        chat_state = init_chat_session(model_ui, grounding)

    # --- 1. User message prep ---
    contents = [message]
    user_display_text = message

    if img_input:
        contents.append(Image.open(img_input))
        user_display_text += "\n\n🖼️ *(Image attached)*"

    user_message_obj = {"role": "user", "content": user_display_text}

    try:
        # --- 2. API Call ---
        response = chat_state.send_message(contents)
        f_imgs, f_txt, t_imgs, t_txt = process_response(response)

        # --- 3. Bot message construction ---
        bot_messages = []

        # A. Thoughts (optional) — shown as a quoted markdown preamble.
        if t_txt or t_imgs:
            thought_md = "🧠 **Model Thought Process:**\n"
            if t_txt: thought_md += f"> {t_txt}\n"
            if t_imgs: thought_md += f"*( + {len(t_imgs)} draft image(s) not displayed)*\n"
            thought_md += "---\n"
            bot_messages.append({"role": "assistant", "content": thought_md})

        # B. Final text
        if f_txt:
            bot_messages.append({"role": "assistant", "content": f_txt})

        # C. Final images — persisted to disk so the Chatbot history can
        # keep referencing them by path across turns.
        if f_imgs:
            for i, img in enumerate(f_imgs):
                unique_filename = f"chat_{uuid.uuid4()}_{i}.png"
                file_path = os.path.join(TEMP_CHAT_DIR, unique_filename)
                img.save(file_path)

                # NOTE(review): (path, caption) tuple content is the legacy
                # "tuples" Chatbot format, but the component is created with
                # type="messages" — confirm the installed Gradio version
                # accepts this (a {"path": ...} dict may be required).
                bot_messages.append({"role": "assistant", "content": (file_path, "Generated Image")})

        # D. Empty response handling — always give the user some feedback.
        if not f_txt and not f_imgs and not t_txt:
            bot_messages.append({"role": "assistant", "content": "⚠️ *The model returned no text or image for this request.*"})

        # --- 4. History Update ---
        new_history = history + [user_message_obj] + bot_messages

        return "", new_history, chat_state, f_imgs

    except Exception as e:
        # Keep the session alive and surface the error inline in the chat.
        err_msg = f"❌ Error: {str(e)}"
        bot_err_obj = {"role": "assistant", "content": err_msg}
        return "", history + [user_message_obj, bot_err_obj], chat_state, []
234
+
235
def clear_chat(model_ui, grounding):
    """Reset the chat tab: empty history, brand-new session, cleared gallery."""
    fresh_session = init_chat_session(model_ui, grounding)
    return [], fresh_session, []
238
+
239
# --- GRADIO INTERFACE ---

css = """
.container { max-width: 1200px; margin: auto; }
h1 { text-align: center; color: #4F46E5; font-size: 2.5em; }
/* Prevent chat images from being too large */
.image-container img { max-height: 400px; width: auto; }
"""

with gr.Blocks(theme=gr.themes.Soft(), css=css, title="Nano Vision Studio") as demo:

    gr.Markdown("# Nano 🍌 Vision Studio")
    gr.Markdown("### The Ultimate Interface: 4K Generation, Grounding, Multi-Image Composition & Iterative Chat")

    # Chat Session State: holds the live genai chat object between turns.
    chat_state = gr.State(None)

    with gr.Tabs():

        # --- TAB 0 : API CONFIGURATION ---
        with gr.TabItem("🔑 API Settings"):
            gr.Markdown("### ⚙️ API Configuration")
            gr.Markdown("""
            To use **Gemini Ultimate Studio**, you must provide your own Google Gemini API Key.
            If you don't have one, you can get it from [Google AI Studio](https://aistudio.google.com/).
            """)

            with gr.Row():
                with gr.Column(scale=3):
                    api_input = gr.Textbox(
                        label="Google Gemini API Key",
                        placeholder="Paste your API key here (starts with AIza...)",
                        type="password",  # Masks the key characters
                        lines=1
                    )
                with gr.Column(scale=1):
                    api_btn = gr.Button("Save & Initialize 💾", variant="primary")

            # Status message area
            api_status = gr.Markdown()

            # Event listener
            api_btn.click(
                update_api_key,
                inputs=[api_input],
                outputs=[api_status]
            )

        # --- TAB 1 : CREATION STUDIO ---
        with gr.TabItem("🎨 Creation Studio"):
            with gr.Row():
                with gr.Column(scale=1):
                    t1_prompt = gr.Textbox(label="Prompt", lines=4, placeholder="Describe the scene in detail (lighting, style, camera angle)...")

                    with gr.Group():
                        gr.Markdown("### ⚙️ Settings")
                        t1_model = gr.Dropdown(choices=list(MODELS.keys()), value="🧠 Gemini 3 Pro Preview (Recommended)", label="Model")

                        with gr.Row():
                            t1_ratio = gr.Dropdown(RATIOS, value="16:9", label="Aspect Ratio")
                            t1_res = gr.Dropdown(RESOLUTIONS, value="2K", label="Resolution (Pro only)")

                        t1_grounding = gr.Checkbox(label="Google Search (Grounding)", info="Use real-time data (Weather, Stocks, News)")

                    t1_btn = gr.Button("Generate ✨", variant="primary", size="lg")

                with gr.Column(scale=2):
                    gr.Markdown("### 🖼️ Result")
                    t1_gallery = gr.Gallery(label="Final Images", columns=2, height="auto")
                    t1_text = gr.Markdown(label="Generated Text")

                    with gr.Accordion("🧠 Thought Process (Automatic)", open=False):
                        t1_thought_imgs = gr.Gallery(label="Visual Drafts", columns=4, height=150)
                        t1_thought_txt = gr.Textbox(label="Thought Stream", interactive=False, lines=4)

            t1_btn.click(
                generate_studio,
                inputs=[t1_prompt, t1_model, t1_ratio, t1_res, t1_grounding],
                outputs=[t1_gallery, t1_text, t1_thought_imgs, t1_thought_txt]
            )

        # --- TAB 2 : COMPOSITION ---
        with gr.TabItem("🛠️ Composition (up to 14 Images)"):
            with gr.Row():
                with gr.Column(scale=1):
                    t2_files = gr.File(label="Reference Images (Max 14)", file_count="multiple", type="filepath")
                    t2_prompt = gr.Textbox(label="Instructions", placeholder="e.g., Combine these elements, transfer the style, keep the character consistent...", lines=3)

                    with gr.Accordion("Advanced Settings", open=False):
                        t2_model = gr.Dropdown(list(MODELS.keys()), value="🧠 Gemini 3 Pro Preview (Recommended)", label="Model")
                        with gr.Row():
                            t2_ratio = gr.Dropdown(RATIOS, value="1:1", label="Aspect Ratio")
                            t2_res = gr.Dropdown(RESOLUTIONS, value="1K", label="Output Resolution")

                    t2_btn = gr.Button("Run", variant="primary")

                with gr.Column(scale=2):
                    t2_gallery = gr.Gallery(label="Result", columns=1)
                    t2_text = gr.Markdown()

            t2_btn.click(
                generate_composition,
                inputs=[t2_prompt, t2_files, t2_model, t2_ratio, t2_res],
                outputs=[t2_gallery, t2_text]
            )

        # --- TAB 3 : ITERATIVE CHAT ---
        with gr.TabItem("💬 Chat & Refinement"):
            gr.Markdown("<center>Conversational Mode: Refine your images step-by-step. Generated images appear in the history.</center>")

            with gr.Row():
                with gr.Column(scale=2):
                    # Main chatbot that will display text AND images interleaved
                    chat_history = gr.Chatbot(label="Session History", height=600, type="messages", bubble_full_width=False)

                    with gr.Row():
                        chat_input = gr.Textbox(label="Your Message", placeholder="e.g., 'Generate a portrait', then 'Add glasses'...", scale=4)
                        chat_img = gr.Image(label="Input Image (Optional)", type="filepath", height=100, scale=1, show_download_button=False, container=False)

                    with gr.Row():
                        chat_btn = gr.Button("Send", variant="primary")
                        clear_btn = gr.Button("🗑️ New Session")

                    with gr.Accordion("Chat Options", open=False):
                        c_model = gr.Dropdown(list(MODELS.keys()), value="🧠 Gemini 3 Pro Preview (Recommended)", label="Model")
                        c_grounding = gr.Checkbox(label="Grounding (Search)")

                with gr.Column(scale=1):
                    gr.Markdown("### 🔍 Last Visual")
                    # Zoom gallery to see the last image large on the side
                    chat_gallery_zoom = gr.Gallery(label="Zoom", columns=1, height="auto")

            chat_btn.click(
                chat_respond,
                inputs=[chat_input, chat_history, chat_state, chat_img, c_model, c_grounding],
                outputs=[chat_input, chat_history, chat_state, chat_gallery_zoom]
            )

            clear_btn.click(
                clear_chat,
                inputs=[c_model, c_grounding],
                outputs=[chat_history, chat_state, chat_gallery_zoom]
            )

        # --- TAB 4 : GUIDE ---
        with gr.TabItem("📚 Guide & Best Practices"):
            gr.Markdown("""
            ### 🍌 Quick Guide

            1. **Creation Studio**: Standard "one-shot" generation. The "Pro" model autonomously decides when to use its "Thinking" process (drafting) for complex prompts.
            2. **Composition**: Drag and drop up to **14 images**! Ideal for character consistency (e.g., uploading 5 photos of the same person), style transfer, or complex montages.
            3. **Iterative Chat**: **The best mode for refining an image.**
               - Start with "Generate a apple".
               - Then simply ask "Make it green".
               - The model maintains context and history.

            ### 💡 Pro Tips (from Documentation)

            * **Be Hyper-Specific**: Instead of "fantasy armor", say "ornate elven plate armor, etched with silver leaf patterns".
            * **Provide Context**: Explain the *purpose* (e.g., "Create a logo for a minimalist brand").
            * **Iterate**: Don't expect perfection instantly. Use the Chat tab to refine.
            * **Step-by-Step**: For complex scenes, break instructions down: "First, background... Then, foreground...".
            * **Semantic Negatives**: Instead of "no cars", say "an empty, deserted street".
            * **Camera Control**: Use terms like "wide-angle", "macro shot", "low-angle perspective".

            ### Key Features
            - **Grounding**: Uses Google Search to generate images based on real-time data (e.g., "Current weather in Tokyo").
            - **Resolution**: Use the "Pro" model to unlock 4K output.
            """)

if __name__ == "__main__":
    demo.launch()