Zguin committed on
Commit
ac10dbc
·
verified ·
1 Parent(s): ff96f1e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -13
app.py CHANGED
@@ -4,8 +4,8 @@ from PIL import Image
4
  from transformers import (
5
  BlipProcessor,
6
  BlipForConditionalGeneration,
7
- AutoTokenizer,
8
- AutoModelForSeq2SeqLM
9
  )
10
  from typing import Union
11
  from gtts import gTTS
@@ -14,8 +14,10 @@ import uuid
14
  import time
15
  import gc
16
 
 
17
  torch.set_num_threads(2)
18
 
 
19
  _pipeline = None
20
 
21
  def init_pipeline():
@@ -33,8 +35,8 @@ class ImageCaptionPipeline:
33
  print(f"Время загрузки BLIP: {time.time() - start_time:.2f} секунд")
34
 
35
  start_time = time.time()
36
- self.translator_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ru")
37
- self.translator_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ru").to(self.device)
38
  print(f"Время загрузки переводчика: {time.time() - start_time:.2f} секунд")
39
 
40
  def generate_captions(self, image: Union[str, Image.Image]) -> tuple:
@@ -42,6 +44,7 @@ class ImageCaptionPipeline:
42
  if isinstance(image, str):
43
  image = Image.open(image)
44
  image = image.convert("RGB")
 
45
  inputs = self.blip_processor(images=image, return_tensors="pt").to(self.device)
46
  with torch.no_grad():
47
  output_ids = self.blip_model.generate(**inputs, max_length=50, num_beams=2, early_stopping=True)
@@ -49,9 +52,16 @@ class ImageCaptionPipeline:
49
  print(f"Время генерации английской подписи: {time.time() - start_time:.2f} секунд")
50
 
51
  start_time = time.time()
 
52
  translated_inputs = self.translator_tokenizer(english_caption, return_tensors="pt", padding=True).to(self.device)
53
  with torch.no_grad():
54
- translated_ids = self.translator_model.generate(**translated_inputs, max_length=50, num_beams=2, early_stopping=True)
 
 
 
 
 
 
55
  russian_caption = self.translator_tokenizer.decode(translated_ids[0], skip_special_tokens=True)
56
  print(f"Время перевода на русский: {time.time() - start_time:.2f} секунд")
57
 
@@ -79,21 +89,21 @@ def generate_audio(english_caption: str, russian_caption: str, audio_language: s
79
  if not english_caption and not russian_caption:
80
  return None
81
  pipeline = init_pipeline()
82
- text = russian_caption.replace("Русский: ", "") if audio_language == "Русский" else english_caption.replace("English: ", "")
83
  return pipeline.generate_audio(text, audio_language)
84
 
85
- with gr.Blocks() as iface:
86
  with gr.Row():
87
- with gr.Column(scale=1, min_width=600, variant="panel"):
88
- image = gr.Image(type="pil", label="Изображение", height=600, width=600)
89
- submit_button = gr.Button("Сгенерировать описание", elem_classes="btn", size="sm")
90
  with gr.Column(scale=1, min_width=300):
91
  english_caption = gr.Textbox(label="Подпись English:", lines=2)
92
  russian_caption = gr.Textbox(label="Подпись Русский:", lines=2)
93
- audio_button = gr.Button("Сгенерировать озвучку", elem_classes="btn", size="sm")
94
  with gr.Row():
95
- audio_language = gr.Dropdown(choices=["Русский", "English"], label="Язык озвучки", value="Русский", scale=1)
96
- audio_output = gr.Audio(label="Озвучка", scale=1, min_width=150)
97
 
98
  submit_button.click(
99
  fn=generate_captions,
 
4
  from transformers import (
5
  BlipProcessor,
6
  BlipForConditionalGeneration,
7
+ M2M100Tokenizer,
8
+ M2M100ForConditionalGeneration
9
  )
10
  from typing import Union
11
  from gtts import gTTS
 
14
  import time
15
  import gc
16
 
17
+ # Оптимизация CPU: установка числа потоков
18
  torch.set_num_threads(2)
19
 
20
+ # Глобальная переменная для кэширования pipeline
21
  _pipeline = None
22
 
23
  def init_pipeline():
 
35
  print(f"Время загрузки BLIP: {time.time() - start_time:.2f} секунд")
36
 
37
  start_time = time.time()
38
+ self.translator_tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
39
+ self.translator_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M").to(self.device)
40
  print(f"Время загрузки переводчика: {time.time() - start_time:.2f} секунд")
41
 
42
  def generate_captions(self, image: Union[str, Image.Image]) -> tuple:
 
44
  if isinstance(image, str):
45
  image = Image.open(image)
46
  image = image.convert("RGB")
47
+ image = image.resize((384, 384)) # Рекомендованный размер для BLIP-large
48
  inputs = self.blip_processor(images=image, return_tensors="pt").to(self.device)
49
  with torch.no_grad():
50
  output_ids = self.blip_model.generate(**inputs, max_length=50, num_beams=2, early_stopping=True)
 
52
  print(f"Время генерации английской подписи: {time.time() - start_time:.2f} секунд")
53
 
54
  start_time = time.time()
55
+ self.translator_tokenizer.src_lang = "en"
56
  translated_inputs = self.translator_tokenizer(english_caption, return_tensors="pt", padding=True).to(self.device)
57
  with torch.no_grad():
58
+ translated_ids = self.translator_model.generate(
59
+ **translated_inputs,
60
+ forced_bos_token_id=self.translator_tokenizer.get_lang_id("ru"),
61
+ max_length=50,
62
+ num_beams=2,
63
+ early_stopping=True
64
+ )
65
  russian_caption = self.translator_tokenizer.decode(translated_ids[0], skip_special_tokens=True)
66
  print(f"Время перевода на русский: {time.time() - start_time:.2f} секунд")
67
 
 
89
  if not english_caption and not russian_caption:
90
  return None
91
  pipeline = init_pipeline()
92
+ text = russian_caption if audio_language == "Русский" else english_caption
93
  return pipeline.generate_audio(text, audio_language)
94
 
95
+ with gr.Blocks(css=".btn {width: 200px; background-color: #4682B4; color: white; border: none; padding: 10px 20px; text-align: center; font-size: 16px;} .equal-height { height: 40px; }") as iface:
96
  with gr.Row():
97
+ with gr.Column(scale=1, min_width=400, variant="panel"):
98
+ image = gr.Image(type="pil", label="Изображение", height=400, width=400)
99
+ submit_button = gr.Button("Сгенерировать описание", elem_classes="btn")
100
  with gr.Column(scale=1, min_width=300):
101
  english_caption = gr.Textbox(label="Подпись English:", lines=2)
102
  russian_caption = gr.Textbox(label="Подпись Русский:", lines=2)
103
+ audio_button = gr.Button("Сгенерировать озвучку", elem_classes="btn")
104
  with gr.Row():
105
+ audio_language = gr.Dropdown(choices=["Русский", "English"], label="Язык озвучки", value="Русский", scale=1, min_width=150, elem_classes="equal-height")
106
+ audio_output = gr.Audio(label="Озвучка", scale=1, min_width=150, elem_classes="equal-height")
107
 
108
  submit_button.click(
109
  fn=generate_captions,