Update app.py
app.py
CHANGED
@@ -4,8 +4,8 @@ from PIL import Image
 from transformers import (
     BlipProcessor,
     BlipForConditionalGeneration,
-    …
-    …
+    AutoTokenizer,
+    AutoModelForSeq2SeqLM
 )
 from typing import Union
 from gtts import gTTS
@@ -32,8 +32,8 @@ class ImageCaptionPipeline:
         print(f"Время загрузки BLIP: {time.time() - start_time:.2f} секунд")

         start_time = time.time()
-        self.translator_tokenizer = …
-        self.translator_model = …
+        self.translator_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ru")
+        self.translator_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ru").to(self.device)
         print(f"Время загрузки переводчика: {time.time() - start_time:.2f} секунд")

     def generate_captions(self, image: Union[str, Image.Image]) -> tuple:
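The translator is replaced with the dedicated English-to-Russian Marian checkpoint Helsinki-NLP/opus-mt-en-ru, loaded through the generic Auto classes (for opus-mt checkpoints these resolve to MarianTokenizer and MarianMTModel, so no model-specific imports are needed). A minimal standalone sketch of the new translation path, using the checkpoint name from the diff and a made-up example sentence:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the en->ru Marian checkpoint named in the diff above.
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ru")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ru")

# Translate one sentence with the same generate settings the app uses.
inputs = tokenizer("a dog is running on the beach", return_tensors="pt")
ids = model.generate(**inputs, max_length=50, num_beams=2, early_stopping=True)
print(tokenizer.batch_decode(ids, skip_special_tokens=True)[0])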
@@ -41,6 +41,8 @@ class ImageCaptionPipeline:
         if isinstance(image, str):
             image = Image.open(image)
         image = image.convert("RGB")
+        # Compress image to recommended size (512x512)
+        image = image.resize((512, 512), Image.Resampling.LANCZOS)
         inputs = self.blip_processor(images=image, return_tensors="pt").to(self.device)
         with torch.no_grad():
             output_ids = self.blip_model.generate(**inputs, max_length=50, num_beams=2, early_stopping=True)
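The new preprocessing step forces every input image to 512x512 before it reaches BLIP. Image.Resampling.LANCZOS only exists in Pillow 9.1 and later; a small compatibility sketch, with a hypothetical input file:

from PIL import Image

# Image.Resampling was added in Pillow 9.1; fall back to the legacy
# module-level constant on older versions.
try:
    LANCZOS = Image.Resampling.LANCZOS
except AttributeError:
    LANCZOS = Image.LANCZOS

img = Image.open("example.jpg").convert("RGB")  # hypothetical input file
img = img.resize((512, 512), LANCZOS)  # fixed size; aspect ratio is not preserved

Note that a fixed (512, 512) resize stretches non-square images, and the BLIP processor rescales to its own input resolution anyway (384x384 for the common captioning checkpoints), so this step mainly bounds the work done on very large uploads.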
@@ -48,12 +50,10 @@ class ImageCaptionPipeline:
         print(f"Время генерации английской подписи: {time.time() - start_time:.2f} секунд")

         start_time = time.time()
-        self.translator_tokenizer.src_lang = "en"
         translated_inputs = self.translator_tokenizer(english_caption, return_tensors="pt", padding=True).to(self.device)
         with torch.no_grad():
             translated_ids = self.translator_model.generate(
                 **translated_inputs,
-                forced_bos_token_id=self.translator_tokenizer.get_lang_id("ru"),
                 max_length=50,
                 num_beams=2,
                 early_stopping=True
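The two removed lines are language-selection plumbing that only a multilingual many-to-many translator needs: src_lang declares the input language and forced_bos_token_id forces the output one, and get_lang_id is specific to the M2M100 tokenizer, which is consistent with the two import lines truncated in the first hunk. Because opus-mt-en-ru translates in exactly one direction, the checkpoint itself fixes the language pair and both arguments can be dropped. For contrast, a hypothetical reconstruction of the removed call path (the checkpoint name is an assumption; the original one is not visible in the diff):

from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Assumed checkpoint: the diff only shows the src_lang / get_lang_id calls,
# which match the M2M100 API.
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")

tokenizer.src_lang = "en"  # source language must be declared explicitly
encoded = tokenizer("a dog is running on the beach", return_tensors="pt")
# The target language is forced via the first generated token.
ids = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id("ru"))
print(tokenizer.batch_decode(ids, skip_special_tokens=True)[0])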
@@ -87,14 +87,14 @@ def generate_audio(english_caption: str, russian_caption: str, audio_language: s
     text = russian_caption if audio_language == "Русский" else english_caption
     return pipeline.generate_audio(text, audio_language)

-with gr.Blocks(css=".btn {width: 200px; background-color: #…
+with gr.Blocks(css=".btn {width: 200px; background-color: #4B0082; color: white; border: none; padding: 10px 20px; text-align: center; font-size: 16px; margin: 0 auto; display: block;} .equal-height {height: 60px !important;}") as iface:
     with gr.Row():
         with gr.Column(scale=1, min_width=400, variant="panel"):
             image = gr.Image(type="pil", label="Изображение", height=400, width=400)
             submit_button = gr.Button("Сгенерировать описание", elem_classes="btn")
         with gr.Column(scale=1, min_width=300):
-            english_caption = gr.Textbox(label="…
-            russian_caption = gr.Textbox(label="…
+            english_caption = gr.Textbox(label="Описание на English:", lines=2)
+            russian_caption = gr.Textbox(label="Описание на Русском:", lines=2)
             audio_button = gr.Button("Сгенерировать озвучку", elem_classes="btn")
     with gr.Row():
         audio_language = gr.Dropdown(choices=["Русский", "English"], label="Язык озвучки", value="Русский", scale=1, min_width=150, elem_classes="equal-height")
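The final hunk fills in the interface: the committed CSS string completes the button style, and the two caption textboxes get labels and a fixed height. The event wiring sits outside this hunk; a hypothetical sketch of how these components are typically connected, where only generate_audio is visible in the diff and the caption handler and audio component are assumptions:

# Hypothetical wiring, not part of this diff.
submit_button.click(
    fn=generate_captions,  # assumed handler returning (english, russian)
    inputs=image,
    outputs=[english_caption, russian_caption],
)
audio_button.click(
    fn=generate_audio,  # shown above at line 87 of app.py
    inputs=[english_caption, russian_caption, audio_language],
    outputs=audio_output,  # hypothetical gr.Audio component
)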