Zguin committed
Commit 98b7982 · verified · 1 Parent(s): e176f16

Update app.py

Files changed (1)
  1. app.py +9 -9
app.py CHANGED
@@ -4,8 +4,8 @@ from PIL import Image
 from transformers import (
     BlipProcessor,
     BlipForConditionalGeneration,
-    M2M100Tokenizer,
-    M2M100ForConditionalGeneration
+    AutoTokenizer,
+    AutoModelForSeq2SeqLM
 )
 from typing import Union
 from gtts import gTTS
@@ -32,8 +32,8 @@ class ImageCaptionPipeline:
         print(f"Время загрузки BLIP: {time.time() - start_time:.2f} секунд")

         start_time = time.time()
-        self.translator_tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
-        self.translator_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M").to(self.device)
+        self.translator_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ru")
+        self.translator_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ru").to(self.device)
         print(f"Время загрузки переводчика: {time.time() - start_time:.2f} секунд")

     def generate_captions(self, image: Union[str, Image.Image]) -> tuple:
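For this checkpoint the Auto* classes resolve to the Marian classes (MarianTokenizer, MarianMTModel). A minimal standalone sketch of the new loading step, assuming the same CUDA-or-CPU device choice the pipeline's self.device makes:

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

device = "cuda" if torch.cuda.is_available() else "cpu"
# Helsinki-NLP/opus-mt-en-ru is a Marian model trained for one fixed
# direction (English -> Russian), so the generic Auto* loaders suffice.
translator_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ru")
translator_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ru").to(device)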
@@ -41,6 +41,8 @@ class ImageCaptionPipeline:
         if isinstance(image, str):
             image = Image.open(image)
         image = image.convert("RGB")
+        # Compress image to recommended size (512x512)
+        image = image.resize((512, 512), Image.Resampling.LANCZOS)
         inputs = self.blip_processor(images=image, return_tensors="pt").to(self.device)
         with torch.no_grad():
             output_ids = self.blip_model.generate(**inputs, max_length=50, num_beams=2, early_stopping=True)
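One caveat on the new resize line: Image.Resampling was added in Pillow 9.1, and older installs expose the same filter as Image.LANCZOS. A version-tolerant sketch (the helper name to_model_size is illustrative); a fixed 512x512 resize also squares non-square images, which is tolerable here since the BLIP processor rescales inputs to the model's own resolution anyway:

from PIL import Image

def to_model_size(image: Image.Image, size: int = 512) -> Image.Image:
    # Image.Resampling exists from Pillow 9.1 on; earlier releases
    # expose the same filter constant as Image.LANCZOS.
    resample = getattr(Image, "Resampling", Image).LANCZOS
    return image.resize((size, size), resample)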
@@ -48,12 +50,10 @@
         print(f"Время генерации английской подписи: {time.time() - start_time:.2f} секунд")

         start_time = time.time()
-        self.translator_tokenizer.src_lang = "en"
         translated_inputs = self.translator_tokenizer(english_caption, return_tensors="pt", padding=True).to(self.device)
         with torch.no_grad():
             translated_ids = self.translator_model.generate(
                 **translated_inputs,
-                forced_bos_token_id=self.translator_tokenizer.get_lang_id("ru"),
                 max_length=50,
                 num_beams=2,
                 early_stopping=True
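The two deleted lines were M2M100-specific: a many-to-many model needs src_lang set on the tokenizer and a forced BOS token to select the target language, while the single-direction Marian checkpoint needs neither. Continuing the loading sketch above, the trimmed translation call with the same generation parameters as the diff:

english_caption = "a dog is running across a field"  # example input
translated_inputs = translator_tokenizer(english_caption, return_tensors="pt", padding=True).to(device)
with torch.no_grad():
    translated_ids = translator_model.generate(
        **translated_inputs,
        max_length=50,
        num_beams=2,
        early_stopping=True,
    )
russian_caption = translator_tokenizer.decode(translated_ids[0], skip_special_tokens=True)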
@@ -87,14 +87,14 @@ def generate_audio(english_caption: str, russian_caption: str, audio_language: s
     text = russian_caption if audio_language == "Русский" else english_caption
     return pipeline.generate_audio(text, audio_language)

-with gr.Blocks(css=".btn {width: 200px; background-color: #4682B4; color: white; border: none; padding: 10px 20px; text-align: center; font-size: 16px;}") as iface:
+with gr.Blocks(css=".btn {width: 200px; background-color: #4B0082; color: white; border: none; padding: 10px 20px; text-align: center; font-size: 16px; margin: 0 auto; display: block;} .equal-height {height: 60px !important;}") as iface:
     with gr.Row():
         with gr.Column(scale=1, min_width=400, variant="panel"):
             image = gr.Image(type="pil", label="Изображение", height=400, width=400)
             submit_button = gr.Button("Сгенерировать описание", elem_classes="btn")
         with gr.Column(scale=1, min_width=300):
-            english_caption = gr.Textbox(label="Подпись English:", lines=2)
-            russian_caption = gr.Textbox(label="Подпись Русский:", lines=2)
+            english_caption = gr.Textbox(label="Описание на English:", lines=2)
+            russian_caption = gr.Textbox(label="Описание на Русском:", lines=2)
             audio_button = gr.Button("Сгенерировать озвучку", elem_classes="btn")
             with gr.Row():
                 audio_language = gr.Dropdown(choices=["Русский", "English"], label="Язык озвучки", value="Русский", scale=1, min_width=150, elem_classes="equal-height")
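With two rule sets, the inline css argument gets hard to scan; one readability option is a module-level constant holding the same rules (the name CUSTOM_CSS is illustrative, not part of the commit):

import gradio as gr

CUSTOM_CSS = """
.btn {width: 200px; background-color: #4B0082; color: white; border: none;
      padding: 10px 20px; text-align: center; font-size: 16px;
      margin: 0 auto; display: block;}
.equal-height {height: 60px !important;}
"""

with gr.Blocks(css=CUSTOM_CSS) as iface:
    ...  # layout as above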
 