Update app.py
app.py CHANGED
@@ -4,8 +4,8 @@ from PIL import Image
 from transformers import (
     BlipProcessor,
     BlipForConditionalGeneration,
-
-
+    M2M100Tokenizer,
+    M2M100ForConditionalGeneration
 )
 from typing import Union
 from gtts import gTTS
@@ -14,8 +14,10 @@ import uuid
 import time
 import gc
 
+# CPU optimization: set the number of threads
 torch.set_num_threads(2)
 
+# Global variable for caching the pipeline
 _pipeline = None
 
 def init_pipeline():
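The two added comments document a lazy-singleton pattern: the pipeline is built once on first use and cached in the module-level global, so repeated Gradio callbacks reuse the loaded models instead of reloading them per request. A minimal sketch of the same pattern, with a hypothetical HeavyPipeline standing in for ImageCaptionPipeline:

_pipeline = None  # module-level cache, shared across calls

class HeavyPipeline:
    # Hypothetical stand-in for ImageCaptionPipeline: expensive to construct.
    def __init__(self):
        print("loading models...")  # executed only once per process

def init_pipeline():
    global _pipeline
    if _pipeline is None:       # first call pays the full load cost
        _pipeline = HeavyPipeline()
    return _pipeline            # subsequent calls return the cached instance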
@@ -33,8 +35,8 @@ class ImageCaptionPipeline:
         print(f"Время загрузки BLIP: {time.time() - start_time:.2f} секунд")
 
         start_time = time.time()
-        self.translator_tokenizer =
-        self.translator_model =
+        self.translator_tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
+        self.translator_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M").to(self.device)
         print(f"Время загрузки переводчика: {time.time() - start_time:.2f} секунд")
 
     def generate_captions(self, image: Union[str, Image.Image]) -> tuple:
@@ -42,6 +44,7 @@ class ImageCaptionPipeline:
         if isinstance(image, str):
             image = Image.open(image)
         image = image.convert("RGB")
+        image = image.resize((384, 384))  # Recommended input size for BLIP-large
         inputs = self.blip_processor(images=image, return_tensors="pt").to(self.device)
         with torch.no_grad():
             output_ids = self.blip_model.generate(**inputs, max_length=50, num_beams=2, early_stopping=True)
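For context, a self-contained sketch of the BLIP captioning step this hunk modifies. The checkpoint name below is an assumption (the added comment only implies a BLIP-large model); the 384x384 resize matches the added line:

import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# "Salesforce/blip-image-captioning-large" is an assumed checkpoint; the diff
# does not show which BLIP-large weights the Space actually loads.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

image = Image.open("example.jpg").convert("RGB").resize((384, 384))
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():  # inference only, no gradient bookkeeping
    output_ids = model.generate(**inputs, max_length=50, num_beams=2, early_stopping=True)
print(processor.decode(output_ids[0], skip_special_tokens=True))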
@@ -49,9 +52,16 @@ class ImageCaptionPipeline:
         print(f"Время генерации английской подписи: {time.time() - start_time:.2f} секунд")
 
         start_time = time.time()
+        self.translator_tokenizer.src_lang = "en"
         translated_inputs = self.translator_tokenizer(english_caption, return_tensors="pt", padding=True).to(self.device)
         with torch.no_grad():
-            translated_ids = self.translator_model.generate(
+            translated_ids = self.translator_model.generate(
+                **translated_inputs,
+                forced_bos_token_id=self.translator_tokenizer.get_lang_id("ru"),
+                max_length=50,
+                num_beams=2,
+                early_stopping=True
+            )
         russian_caption = self.translator_tokenizer.decode(translated_ids[0], skip_special_tokens=True)
         print(f"Время перевода на русский: {time.time() - start_time:.2f} секунд")
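The added src_lang / forced_bos_token_id pair is how M2M100 selects a translation direction: the tokenizer tags the input as English, and forcing the Russian language token as the first decoder token makes the model generate Russian. A standalone sketch using the same checkpoint as the diff:

import torch
from transformers import M2M100Tokenizer, M2M100ForConditionalGeneration

tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")

tokenizer.src_lang = "en"  # declare the source language before tokenizing
inputs = tokenizer("a dog running on the beach", return_tensors="pt")
with torch.no_grad():
    ids = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.get_lang_id("ru"),  # force Russian output
        max_length=50,
        num_beams=2,
        early_stopping=True,
    )
print(tokenizer.decode(ids[0], skip_special_tokens=True))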
@@ -79,21 +89,21 @@ def generate_audio(english_caption: str, russian_caption: str, audio_language: str):
     if not english_caption and not russian_caption:
         return None
     pipeline = init_pipeline()
-    text = russian_caption
+    text = russian_caption if audio_language == "Русский" else english_caption
     return pipeline.generate_audio(text, audio_language)
 
-with gr.Blocks() as iface:
+with gr.Blocks(css=".btn {width: 200px; background-color: #4682B4; color: white; border: none; padding: 10px 20px; text-align: center; font-size: 16px;} .equal-height { height: 40px; }") as iface:
     with gr.Row():
-        with gr.Column(scale=1, min_width=
-            image = gr.Image(type="pil", label="Изображение", height=
-            submit_button = gr.Button("Сгенерировать описание", elem_classes="btn"
+        with gr.Column(scale=1, min_width=400, variant="panel"):
+            image = gr.Image(type="pil", label="Изображение", height=400, width=400)
+            submit_button = gr.Button("Сгенерировать описание", elem_classes="btn")
         with gr.Column(scale=1, min_width=300):
             english_caption = gr.Textbox(label="Подпись English:", lines=2)
             russian_caption = gr.Textbox(label="Подпись Русский:", lines=2)
-            audio_button = gr.Button("Сгенерировать озвучку", elem_classes="btn"
+            audio_button = gr.Button("Сгенерировать озвучку", elem_classes="btn")
     with gr.Row():
-        audio_language = gr.Dropdown(choices=["Русский", "English"], label="Язык озвучки", value="Русский", scale=1)
-        audio_output = gr.Audio(label="Озвучка", scale=1, min_width=150)
+        audio_language = gr.Dropdown(choices=["Русский", "English"], label="Язык озвучки", value="Русский", scale=1, min_width=150, elem_classes="equal-height")
+        audio_output = gr.Audio(label="Озвучка", scale=1, min_width=150, elem_classes="equal-height")
 
     submit_button.click(
         fn=generate_captions,
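The pipeline's generate_audio method itself is outside this diff; given the gtts and uuid imports, it plausibly looks like the sketch below. The function body is an assumption, not the Space's actual code:

import uuid
from gtts import gTTS

def generate_audio(text: str, audio_language: str) -> str:
    # Hypothetical reconstruction: map the UI language to a gTTS code,
    # synthesize speech, and return a unique file path for gr.Audio.
    lang = "ru" if audio_language == "Русский" else "en"
    path = f"/tmp/{uuid.uuid4()}.mp3"  # uuid avoids collisions between requests
    gTTS(text=text, lang=lang).save(path)
    return path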
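The diff ends mid-call. A hedged sketch of how the event wiring presumably continues; the exact inputs/outputs lists are not visible in the diff and are assumptions, though the generate_audio signature above suggests its three inputs:

submit_button.click(
    fn=generate_captions,
    inputs=image,                                # assumed: the gr.Image component
    outputs=[english_caption, russian_caption],  # assumed: both caption textboxes
)
audio_button.click(
    fn=generate_audio,
    inputs=[english_caption, russian_caption, audio_language],
    outputs=audio_output,
)
iface.launch()  # assumed: standard way to start a Gradio Space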