Spaces: Running on Zero

Update app.py (reason1 demo draft)

This commit replaces the stock Gradio text-to-image template (a diffusers DiffusionPipeline demo) with a Cosmos-Reason1 image/video reasoning demo. The removed lines of the old template appear first, followed by the added lines of the new app.

app.py CHANGED
@@ -1,154 +1,286 @@
 import gradio as gr
-import numpy as np
-import random
-
-# import spaces #[uncomment to use ZeroGPU]
-from diffusers import DiffusionPipeline
 import torch

[old lines 9-19: DiffusionPipeline setup for the pipe used in infer; content not rendered in the diff view]
-MAX_SEED = np.iinfo(np.int32).max
-MAX_IMAGE_SIZE = 1024
-
-
-# @spaces.GPU #[uncomment to use ZeroGPU]
-def infer(
-    prompt,
-    negative_prompt,
-    seed,
-    randomize_seed,
-    width,
-    height,
-    guidance_scale,
-    num_inference_steps,
-    progress=gr.Progress(track_tqdm=True),
-):
-    if randomize_seed:
-        seed = random.randint(0, MAX_SEED)
-
-    generator = torch.Generator().manual_seed(seed)
-
-    image = pipe(
-        prompt=prompt,
-        negative_prompt=negative_prompt,
-        guidance_scale=guidance_scale,
-        num_inference_steps=num_inference_steps,
-        width=width,
-        height=height,
-        generator=generator,
-    ).images[0]
-
-    return image, seed
-
-
-examples = [
-    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
-    "An astronaut riding a green horse",
-    "A delicious ceviche cheesecake slice",
-]
-
-css = """
-#col-container {
-    margin: 0 auto;
-    max-width: 640px;
-}
-"""
-
-with gr.Blocks(css=css) as demo:
-    with gr.Column(elem_id="col-container"):
-        gr.Markdown(" # Text-to-Image Gradio Template")
-
-        with gr.Row():
-            prompt = gr.Text(
-                label="Prompt",
-                show_label=False,
-                max_lines=1,
-                placeholder="Enter your prompt",
-                container=False,
-            )

[old lines 80-134: run button, result image, and advanced-settings controls (seed, randomize seed, width, height, guidance scale, inference steps); content not rendered in the diff view]

-    gr.on(
-        triggers=[run_button.click, prompt.submit],
-        fn=infer,
         inputs=[
-            prompt,
-            negative_prompt,
-            seed,
-            randomize_seed,
-            width,
-            height,
-            guidance_scale,
-            num_inference_steps,
         ],
-        outputs=[result, seed],
     )

 if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
 import torch
+import os
+import json
+from transformers import AutoProcessor, AutoModelForCausalLM, AutoModelForVision2Seq
+from qwen_vl_utils import process_vision_info
+
+# Model configuration
+MODEL_PATH = "nvidia/Cosmos-Reason1-7B"
+
+# Role configurations
+ROLES = {
+    "General Assistant": "You are a helpful assistant. Answer the question in the following format: \n<think>\nyour reasoning\n</think>\n\n<answer>\nyour answer\n</answer>.",
+    "Video Analyzer": """You are a helpful video analyzer. The goal is to identify artifacts and anomalies in the video. Watch carefully and focus on the following details:
+
+* Physical accuracy (gravity, collision, object interaction, fluid dynamics, object permanence, etc.)
+* Common sense
+* Cause-and-effect
+* Temporal consistency
+* Spatial consistency
+* Human motion
+* Material and texture realism
+
+Here are some examples of commonly found artifacts and anomalies:
+
+* If objects penetrate each other, this indicates a failure in collision detection, object interaction, and physical accuracy.
+* If hands penetrate each other, or hands pass through objects, this indicates a failure in collision detection, object interaction, and physical accuracy.
+* If an object moves in an unexpected way or moves without any apparent reason, this suggests a failure in causality, object interaction, and physical accuracy.
+* If an object suddenly flips or changes direction, this suggests a failure in temporal consistency.
+* If an object suddenly appears or disappears, or the count of objects in the video suddenly changes, this suggests a failure in temporal consistency.
+* If an object transforms or deforms halfway through the video, this suggests a failure in temporal consistency.
+* If an object is used in a way that defies its intended purpose or normal function, this indicates a violation of common sense.
+* If liquid flows through a solid object, such as water flowing through a pan, this suggests a failure in physical accuracy and fluid dynamics.
+* If a person's legs or arms suddenly switch positions in an impossible way, such as the left leg appearing where the right leg was just a moment ago, this suggests a failure in human motion and temporal consistency.
+* If a person's body suddenly morphs or changes shape, this suggests a failure in human motion and temporal consistency.
+* If an object's texture, material or surface is unnaturally smooth, this suggests a failure in object surface reconstruction.

+Here are some examples of non-artifacts you should not include in your analysis:
+
+* Being an animated video, such as a cartoon, does not automatically make it an artifact.
+* Avoid ungrounded and over-general explanations such as overall impression, artistic style, or background elements.
+* The video has no sound. Avoid explanations based on sound.
+* Do not mention lighting, shadows, blurring, or camera effects in your analysis.
+
+Answer the question in English with the provided options in the following format:
+<think>\nyour reasoning\n</think>\n\n<answer>\nyour answer\n</answer>.""",
+    "Custom Role": "You are a helpful assistant. Answer the question in the following format: \n<think>\nyour reasoning\n</think>\n\n<answer>\nyour answer\n</answer>."
 }

+# Default configuration
+default_config = {
+    "attention_mode": "sdpa",
+    "torch_dtype": "float16",
+    "device_map": "auto",
+    "trust_remote_code": True
+}

+# Load or create config file
+config_file = "cosmos_reason1_config.json"
+try:
+    if not os.path.exists(config_file):
+        with open(config_file, "w") as f:
+            json.dump(default_config, f, indent=4)
+        config = default_config
+    else:
+        with open(config_file, "r") as f:
+            config = json.load(f)
+except Exception as e:
+    print(f"Warning: Could not load config file: {e}")
+    print("Using default configuration")
+    config = default_config

+# Initialize the model with configuration
+try:
+    model = AutoModelForVision2Seq.from_pretrained(
+        MODEL_PATH,
+        torch_dtype=getattr(torch, config["torch_dtype"]),
+        device_map=config["device_map"],
+        trust_remote_code=config["trust_remote_code"]
+    )
+except Exception as e:
+    print(f"Error loading model: {e}")
+    raise
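+# NOTE: config["attention_mode"] is persisted in the JSON file but is not passed
+# to from_pretrained above, so the selected attention backend is not applied yet.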

+# Initialize sampling parameters
+generation_config = {
+    "temperature": 0.6,
+    "top_p": 0.95,
+    "repetition_penalty": 1.05,
+    "max_new_tokens": 4096,
+}

+# Initialize the processor
+try:
+    processor = AutoProcessor.from_pretrained(MODEL_PATH)
+except Exception as e:
+    print(f"Error loading processor: {e}")
+    raise

+def process_input(image, video, text_prompt, temperature, top_p, repetition_penalty, max_tokens, role, custom_role_text):
+    """Process the input and generate a response."""
+    try:
+        # Use custom role text if role is "Custom Role"
+        role_prompt = custom_role_text if role == "Custom Role" else ROLES[role]
+
+        messages = [
+            {"role": "system", "content": role_prompt},
+            {"role": "user", "content": []}
+        ]
+
+        # Add text prompt
+        if text_prompt:
+            messages[1]["content"].append({"type": "text", "text": text_prompt})
+
+        # Add image if provided
+        if image is not None:
+            messages[1]["content"].append({"type": "image", "image": image})
+
+        # Add video if provided
+        if video is not None:
+            messages[1]["content"].append({
+                "type": "video",
+                "video": video,
+                "fps": 4,
+            })
+
+        # Process the prompt
+        prompt = processor.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+
+        # Process vision information
+        image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
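+        # NOTE: video_kwargs (e.g. the fps metadata) is returned above but never
+        # forwarded to the processor call below.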
+
+        # Prepare inputs
+        inputs = processor(
+            text=prompt,
+            images=image_inputs if image_inputs is not None else None,
+            videos=video_inputs if video_inputs is not None else None,
+            return_tensors="pt"
+        ).to(model.device)
+
+        # Update generation config with user parameters
+        current_generation_config = {
+            "temperature": temperature,
+            "top_p": top_p,
+            "repetition_penalty": repetition_penalty,
+            "max_new_tokens": max_tokens,
+        }
+
+        # Generate response
+        with torch.no_grad():
+            outputs = model.generate(
+                **inputs,
+                **current_generation_config
+            )
+
+        generated_text = processor.decode(outputs[0], skip_special_tokens=True)
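+        # NOTE: outputs[0] still contains the prompt tokens, so the decoded text
+        # includes the full chat prefix as well as the model's answer.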
+
+        return generated_text, "✅ Generation completed successfully!"
+    except Exception as e:
+        import traceback
+        error_trace = traceback.format_exc()
+        return f"Error processing input: {str(e)}", f"❌ Error occurred:\n{error_trace}"

+def apply_config_changes(attention_mode, torch_dtype, device_map):
+    """Apply configuration changes and save to file."""
+    try:
+        config = {
+            "attention_mode": attention_mode,
+            "torch_dtype": torch_dtype,
+            "device_map": device_map,
+            "trust_remote_code": True
+        }
+
+        with open(config_file, "w") as f:
+            json.dump(config, f, indent=4)
+
+        return "Configuration updated. Please restart the application for changes to take effect."
+    except Exception as e:
+        return f"Error updating configuration: {str(e)}"

+# Create the Gradio interface
+with gr.Blocks(title="Cosmos-Reason1", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# Cosmos-Reason1")
+    gr.Markdown("Upload an image or video and ask a question about it.")
+
+    with gr.Accordion("Model Configuration", open=False):
+        attention_mode = gr.Dropdown(
+            choices=["sdpa", "xformers", "flash_attention_2"],
+            value=config["attention_mode"],
+            label="Attention Mode"
+        )
+        torch_dtype = gr.Dropdown(
+            choices=["float16", "bfloat16", "float32"],
+            value=config["torch_dtype"],
+            label="Torch Data Type"
+        )
+        device_map = gr.Dropdown(
+            choices=["auto", "cuda", "cpu"],
+            value=config["device_map"],
+            label="Device Map"
+        )
+        config_btn = gr.Button("Apply Configuration")
+        config_msg = gr.Markdown()
+
+        config_btn.click(
+            fn=apply_config_changes,
+            inputs=[attention_mode, torch_dtype, device_map],
+            outputs=config_msg
+        )
+
+    with gr.Row():
+        with gr.Column():
+            role_selector = gr.Dropdown(
+                choices=list(ROLES.keys()),
+                value="General Assistant",
+                label="Select Role"
+            )
+
+            custom_role_panel = gr.Group(visible=False)
+            with custom_role_panel:
+                custom_role_text = gr.Textbox(
+                    label="Custom Role Instructions",
+                    placeholder="Enter custom role instructions here...",
+                    lines=10,
+                    value=ROLES["Custom Role"]
                 )
+                apply_custom_role = gr.Button("Apply Custom Role")
+                custom_role_status = gr.Markdown()
+
+            def update_custom_role(text):
+                ROLES["Custom Role"] = text
+                return "Custom role updated successfully!"
+
+            apply_custom_role.click(
+                fn=update_custom_role,
+                inputs=[custom_role_text],
+                outputs=[custom_role_status]
             )
+
+            def toggle_custom_role(role):
+                return gr.update(visible=(role == "Custom Role"))
+
+            role_selector.change(
+                fn=toggle_custom_role,
+                inputs=[role_selector],
+                outputs=[custom_role_panel]
+            )
+
+            image_input = gr.Image(label="Image Input", type="filepath")
+            video_input = gr.Video(label="Video Input")
+            text_input = gr.Textbox(label="Question", placeholder="Ask a question about the image or video...")
+
+            with gr.Accordion("Generation Parameters", open=False):
+                temperature = gr.Slider(0.1, 2.0, value=0.6, step=0.1, label="Temperature")
+                top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top P")
+                repetition_penalty = gr.Slider(1.0, 2.0, value=1.05, step=0.05, label="Repetition Penalty")
+                max_tokens = gr.Slider(64, 4096, value=4096, step=64, label="Max Tokens")
+
+            submit_btn = gr.Button("Submit")
+
+        with gr.Column():
+            output = gr.Textbox(label="Model Response", lines=10)
+            status = gr.Markdown(label="Status")
+
+    submit_btn.click(
+        fn=process_input,
         inputs=[
+            image_input,
+            video_input,
+            text_input,
+            temperature,
+            top_p,
+            repetition_penalty,
+            max_tokens,
+            role_selector,
+            custom_role_text
         ],
+        outputs=[output, status]
     )

 if __name__ == "__main__":
+    demo.launch()
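
Review note: model.generate returns the prompt tokens followed by the newly generated tokens, so the processor.decode(outputs[0], ...) call above yields the full chat prefix along with the model's answer. A minimal trimming sketch, reusing the inputs, outputs, and processor names from process_input (nothing here is part of the committed app):

    # Decode only the completion by slicing off the prompt tokens.
    prompt_length = inputs["input_ids"].shape[1]
    completion_ids = outputs[0][prompt_length:]
    completion = processor.decode(completion_ids, skip_special_tokens=True)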
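Review note: every role prompt asks the model to wrap its output in <think>...</think> and <answer>...</answer> tags. A small self-contained helper for splitting such a response; the split_reasoning name is illustrative, not part of the app:

    import re

    def split_reasoning(generated_text: str):
        """Return (reasoning, answer) from a tagged Cosmos-Reason1 style response.

        Falls back to the raw text when the model ignores the tag format.
        """
        think = re.search(r"<think>\s*(.*?)\s*</think>", generated_text, re.DOTALL)
        answer = re.search(r"<answer>\s*(.*?)\s*</answer>", generated_text, re.DOTALL)
        reasoning = think.group(1) if think else ""
        final = answer.group(1) if answer else generated_text.strip()
        return reasoning, final

    # Example:
    reasoning, final = split_reasoning(
        "<think>\nThe cup passes through the table.\n</think>\n\n"
        "<answer>\nYes, there is a physics artifact.\n</answer>"
    )
    print(final)  # -> Yes, there is a physics artifact.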
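Review note: the Space badge says "Running on Zero", and the removed template carried "# import spaces #[uncomment to use ZeroGPU]" hints, but the new app never requests a GPU. A sketch of the usual ZeroGPU wiring, assuming the standard spaces package available on ZeroGPU Spaces (the body is elided; it would be the process_input body above):

    import spaces

    @spaces.GPU  # allocate a GPU for the duration of each call on ZeroGPU
    def process_input(image, video, text_prompt, temperature, top_p,
                      repetition_penalty, max_tokens, role, custom_role_text):
        """Same body as the process_input defined in app.py above."""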