Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -24,12 +24,40 @@ from transformers import (
 from transformers.image_utils import load_image
 from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 
+# Constants
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+MAX_SEED = np.iinfo(np.int32).max
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
+# -----------------------
+# PROGRESS BAR HELPER
+# -----------------------
+def progress_bar_html(label: str) -> str:
+    """
+    Returns an HTML snippet for a thin progress bar with a label.
+    The progress bar is styled as a dark red animated bar.
+    """
+    return f'''
+<div style="display: flex; align-items: center;">
+    <span style="margin-right: 10px; font-size: 14px;">{label}</span>
+    <div style="width: 110px; height: 5px; background-color: #f0f0f0; border-radius: 2px; overflow: hidden;">
+        <div style="width: 100%; height: 100%; background-color: #ff0000; animation: loading 1.5s linear infinite;"></div>
+    </div>
+</div>
+<style>
+@keyframes loading {{
+    0% {{ transform: translateX(-100%); }}
+    100% {{ transform: translateX(100%); }}
+}}
+</style>
+    '''
+
+# -----------------------
+# TEXT & TTS MODELS
+# -----------------------
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
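For context, a minimal sketch (not part of this commit) of how a snippet like progress_bar_html() gets consumed: a generator-style Gradio chat handler yields the bar as an interim response, then yields the final answer, which replaces it. The handler name and label are illustrative.

import time
import gradio as gr

def chat_fn(message, history):
    # Each yielded value replaces the currently displayed response,
    # so the animated bar shows until the real result is ready.
    yield progress_bar_html("Processing request")
    time.sleep(2)  # stand-in for actual model work
    yield "Done!"

demo = gr.ChatInterface(chat_fn)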
@@ -43,6 +71,10 @@ TTS_VOICES = [
     "en-US-JennyNeural",  # @tts1
     "en-US-GuyNeural",    # @tts2
 ]
+
+# -----------------------
+# MULTIMODAL (OCR) MODELS
+# -----------------------
 MODEL_ID_VL = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID_VL, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
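For reference, the standard transformers inference path for a Qwen2-VL checkpoint like the one loaded above; this sketch is not from app.py, and the image URL, prompt, and generation length are illustrative.

from transformers.image_utils import load_image

image = load_image("https://example.com/receipt.png")  # hypothetical URL
messages = [{"role": "user", "content": [
    {"type": "image"},
    {"type": "text", "text": "OCR the text in this image."},
]}]
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(text=[prompt], images=[image], return_tensors="pt").to(device)
output_ids = model_m.generate(**inputs, max_new_tokens=512)
# Strip the prompt tokens before decoding so only the answer remains
answer_ids = output_ids[:, inputs["input_ids"].shape[1]:]
print(processor.batch_decode(answer_ids, skip_special_tokens=True)[0])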
@@ -81,7 +113,6 @@ def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
         seed = random.randint(0, MAX_SEED)
     return seed
 
-MAX_SEED = np.iinfo(np.int32).max
 CACHE_EXAMPLES = torch.cuda.is_available() and os.getenv("CACHE_EXAMPLES", "0") == "1"
 MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "2048"))
 USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
@@ -89,6 +120,9 @@ ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
 
 dtype = torch.float16 if device.type == "cuda" else torch.float32
 
+# -----------------------
+# STABLE DIFFUSION IMAGE GENERATION MODELS
+# -----------------------
 if torch.cuda.is_available():
     # Lightning 5 model
     pipe = StableDiffusionXLPipeline.from_pretrained(
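The hunk cuts off inside from_pretrained(, so as a sketch only, here is a typical SDXL setup using the two classes imported at the top of app.py; the checkpoint id and keyword arguments are assumptions, not the commit's actual values.

if torch.cuda.is_available():
    pipe = StableDiffusionXLPipeline.from_pretrained(
        "SG161222/RealVisXL_V4.0_Lightning",  # hypothetical checkpoint id
        torch_dtype=dtype,
        use_safetensors=True,
    ).to(device)
    # The EulerAncestralDiscreteScheduler import suggests a scheduler swap:
    pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)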
@@ -174,6 +208,9 @@ def save_image(img: Image.Image) -> str:
     img.save(unique_name)
     return unique_name
 
+# -----------------------
+# MAIN GENERATION FUNCTION
+# -----------------------
 @spaces.GPU
 def generate(
     input_dict: dict,
@@ -188,7 +225,7 @@ def generate(
     files = input_dict.get("files", [])
 
     lower_text = text.lower().strip()
-    #
+    # If the prompt is an image generation command (using model flags)
     if (lower_text.startswith("@lightningv5") or
         lower_text.startswith("@lightningv4") or
         lower_text.startswith("@turbov3")):
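The chained startswith() checks above can be collapsed, since str.startswith accepts a tuple of prefixes; a sketch (the helper name is illustrative), with the tag list matching the textbox placeholder later in this diff:

IMAGE_GEN_TAGS = ("@lightningv5", "@lightningv4", "@turbov3")

def is_image_command(text: str) -> bool:
    # Equivalent to the three chained startswith() calls in generate()
    return text.lower().strip().startswith(IMAGE_GEN_TAGS)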
@@ -234,7 +271,7 @@ def generate(
         torch.cuda.empty_cache()
 
     selected_pipe = models.get(model_choice, pipe)
-    yield "
+    yield progress_bar_html("Processing Image Generation")
     images = selected_pipe(**options).images
     image_path = save_image(images[0])
     yield gr.Image(image_path)
@@ -272,7 +309,7 @@ def generate(
     thread.start()
 
     buffer = ""
-    yield "
+    yield progress_bar_html("Processing with Qwen2VL OCR")
     for new_text in streamer:
         buffer += new_text
         buffer = buffer.replace("<|im_end|>", "")
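The streamer/thread pair in this hunk is the standard transformers streaming pattern; a self-contained sketch, with the prompt and generation arguments illustrative:

from threading import Thread
from transformers import TextIteratorStreamer

streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
inputs = tokenizer("Hello", return_tensors="pt").to(device)
thread = Thread(target=model.generate,
                kwargs=dict(**inputs, streamer=streamer, max_new_tokens=128))
thread.start()

buffer = ""
for new_text in streamer:  # yields decoded text as tokens are generated
    buffer += new_text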
@@ -311,6 +348,9 @@ def generate(
     output_file = asyncio.run(text_to_speech(final_response, voice))
     yield gr.Audio(output_file, autoplay=True)
 
+# -----------------------
+# GRADIO INTERFACE
+# -----------------------
 DESCRIPTION = """
 # IMAGINEO CHAT ⚡
 """
@@ -354,7 +394,6 @@ demo = gr.ChatInterface(
     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="use the tags @lightningv5 @lightningv4 @turbov3 for image gen !"),
     stop_btn="Stop Generation",
     multimodal=True,
-
 )
 
 if __name__ == "__main__":
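The diff ends at the entry-point guard; a typical closing block for a Space like this one, with the queue size illustrative:

if __name__ == "__main__":
    demo.queue(max_size=20).launch()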