LPX55 committed on
Commit
24db381
Β·
verified Β·
1 Parent(s): 54fce6e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -41
app.py CHANGED
@@ -1,21 +1,19 @@
1
  import spaces
2
  import gradio as gr
3
- from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, AutoModelForVision2Seq
4
  from qwen_vl_utils import process_vision_info
5
  from PIL import Image
6
  import torch
7
  import os, time
8
- from peft import PeftModel
9
 
10
  # Load the model and processor
11
- model = AutoModelForVision2Seq.from_pretrained(
12
  "daniel3303/QwenStoryteller",
13
- torch_dtype=torch.bfloat16,
14
  device_map="auto"
15
  )
16
  processor = AutoProcessor.from_pretrained("daniel3303/QwenStoryteller")
17
 
18
-
19
  @spaces.GPU()
20
  @torch.no_grad()
21
  def generate_story(file_paths, progress=gr.Progress(track_tqdm=True)):
@@ -23,7 +21,7 @@ def generate_story(file_paths, progress=gr.Progress(track_tqdm=True)):
23
  images = [Image.open(file_path) for file_path in file_paths]
24
 
25
  image_content = []
26
- for img in images[:12]: # Limit to 6 images
27
  image_content.append({
28
  "type": "image",
29
  "image": img,
@@ -34,7 +32,7 @@ def generate_story(file_paths, progress=gr.Progress(track_tqdm=True)):
34
  messages = [
35
  {
36
  "role": "system",
37
- "content": "You are an English speaking AI storyteller that can analyze sequences of images and create creative narratives. First think step-by-step to analyze characters, objects, settings, and narrative structure. Then create a grounded story that maintains consistent character identity and object references across frames. Use `<think>` tags to show your reasoning process before writing the final story."
38
  },
39
  {
40
  "role": "user",
@@ -74,52 +72,48 @@ def generate_story(file_paths, progress=gr.Progress(track_tqdm=True)):
74
 
75
 
76
  with gr.Blocks(fill_height=True) as demo:
77
- gr.Markdown("# Qwen Storyteller \n## Upload up to 6 images to generate a creative story.")
78
 
79
  with gr.Row():
80
  with gr.Column():
81
- upload_button = gr.UploadButton("Upload up to 12 images", file_types=["image"], file_count="multiple")
82
  output_file = gr.File(label="Uploaded Files")
83
  gen_button = gr.Button("Generate", variant="primary")
84
 
85
  with gr.Column():
86
- outputs = gr.Markdown(label="Generated Story", show_copy_button=True, container=True)
87
 
88
  with gr.Row():
89
- with gr.Column():
90
- gr.Markdown(
91
- """
92
- ### Key Features
93
- * Cross-Frame Consistency: Maintains consistent character and object identity across multiple frames through visual similarity and face recognition techniques
94
- * Structured Reasoning: Employs chain-of-thought reasoning to analyze scenes with explicit modeling of characters, objects, settings, and narrative structure
95
- * Grounded Storytelling: Uses specialized XML tags to link narrative elements directly to visual entities
96
- * Reduced Hallucinations: Achieves 12.3% fewer hallucinations compared to the non-fine-tuned base model
97
-
98
- Model trained by daniel3303, [repository here.](https://huggingface.co/daniel3303/QwenStoryteller)
99
- """
100
- )
101
- with gr.Accordion():
102
- gr.Markdown(
103
- """
104
- ```
105
- @misc{oliveira2025storyreasoningdatasetusingchainofthought,
106
- title={StoryReasoning Dataset: Using Chain-of-Thought for Scene Understanding and Grounded Story Generation},
107
- author={Daniel A. P. Oliveira and David Martins de Matos},
108
- year={2025},
109
- eprint={2505.10292},
110
- archivePrefix={arXiv},
111
- primaryClass={cs.CV},
112
- url={https://arxiv.org/abs/2505.10292},
113
- }
114
- ```
115
- """
116
- )
117
- with gr.Column():
118
- gr.Markdown("")
119
 
120
  upload_button.upload(lambda files: [f.name for f in files], upload_button, output_file)
121
 
122
  gen_button.click(generate_story, upload_button, outputs)
123
 
124
  if __name__ == "__main__":
125
- demo.queue().launch(show_error=True)
 
1
  import spaces
2
  import gradio as gr
3
+ from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
4
  from qwen_vl_utils import process_vision_info
5
  from PIL import Image
6
  import torch
7
  import os, time
 
8
 
9
  # Load the model and processor
10
+ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
11
  "daniel3303/QwenStoryteller",
12
+ torch_dtype=torch.float16,
13
  device_map="auto"
14
  )
15
  processor = AutoProcessor.from_pretrained("daniel3303/QwenStoryteller")
16
 
 
17
  @spaces.GPU()
18
  @torch.no_grad()
19
  def generate_story(file_paths, progress=gr.Progress(track_tqdm=True)):
 
21
  images = [Image.open(file_path) for file_path in file_paths]
22
 
23
  image_content = []
24
+ for img in images[:10]: # Limit to 10 images
25
  image_content.append({
26
  "type": "image",
27
  "image": img,
 
32
  messages = [
33
  {
34
  "role": "system",
35
+ "content": "You are an AI storyteller that can analyze sequences of images and create creative narratives. First think step-by-step to analyze characters, objects, settings, and narrative structure. Then create a grounded story that maintains consistent character identity and object references across frames. Use 🧠 tags to show your reasoning process before writing the final story."
36
  },
37
  {
38
  "role": "user",
 
72
 
73
 
74
  with gr.Blocks(fill_height=True) as demo:
75
+ gr.Markdown("# Qwen Storyteller \n## Upload up to 10 images to generate a creative story.")
76
 
77
  with gr.Row():
78
  with gr.Column():
79
+ upload_button = gr.UploadButton("Upload up to 10 images", file_types=["image"], file_count="multiple")
80
  output_file = gr.File(label="Uploaded Files")
81
  gen_button = gr.Button("Generate", variant="primary")
82
 
83
  with gr.Column():
84
+ outputs = gr.Markdown(label="Generated Story", show_copy_button=True)
85
 
86
  with gr.Row():
87
+ gr.Markdown(
88
+ """
89
+ ### Key Features
90
+ * Cross-Frame Consistency: Maintains consistent character and object identity across multiple frames through visual similarity and face recognition techniques
91
+ * Structured Reasoning: Employs chain-of-thought reasoning to analyze scenes with explicit modeling of characters, objects, settings, and narrative structure
92
+ * Grounded Storytelling: Uses specialized XML tags to link narrative elements directly to visual entities
93
+ * Reduced Hallucinations: Achieves 12.3% fewer hallucinations compared to the non-fine-tuned base model
94
+
95
+ Model trained by daniel3303, [repository here.](https://huggingface.co/daniel3303/QwenStoryteller)
96
+ """
97
+ )
98
+ gr.Markdown(
99
+ """
100
+ ```
101
+ @misc{oliveira2025storyreasoningdatasetusingchainofthought,
102
+ title={StoryReasoning Dataset: Using Chain-of-Thought for Scene Understanding and Grounded Story Generation},
103
+ author={Daniel A. P. Oliveira and David Martins de Matos},
104
+ year={2025},
105
+ eprint={2505.10292},
106
+ archivePrefix={arXiv},
107
+ primaryClass={cs.CV},
108
+ url={https://arxiv.org/abs/2505.10292},
109
+ }
110
+ ```
111
+ """
112
+ )
 
 
 
 
113
 
114
  upload_button.upload(lambda files: [f.name for f in files], upload_button, output_file)
115
 
116
  gen_button.click(generate_story, upload_button, outputs)
117
 
118
  if __name__ == "__main__":
119
+ demo.queue().launch(show_error=True)