Zguin committed
Commit d98c8c7 · verified · 1 Parent(s): e5b771f

Update app.py

Files changed (1)
  1. app.py +79 -27
app.py CHANGED
@@ -10,55 +10,107 @@ from transformers import (
 from typing import Union
 from gtts import gTTS
 import os
+import uuid
+import time
+import gc
+
+# CPU optimization: set the number of threads
+torch.set_num_threads(2)
+
+# Global variable for caching the pipeline
+_pipeline = None
+
+def init_pipeline():
+    global _pipeline
+    if _pipeline is None:
+        _pipeline = ImageCaptionPipeline()
+    return _pipeline
 
 class ImageCaptionPipeline:
     def __init__(self):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        self.blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base", use_fast=True)
-        self.blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(self.device)
+        start_time = time.time()
+        self.blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large", use_fast=True)
+        self.blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(self.device)
+        print(f"BLIP load time: {time.time() - start_time:.2f} seconds")
+
+        start_time = time.time()
         self.translator_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ru")
         self.translator_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ru").to(self.device)
+        print(f"Translator load time: {time.time() - start_time:.2f} seconds")
 
-    def generate_caption(self, image: Union[str, Image.Image], language: str = "Русский") -> str:
+    def generate_captions(self, image: Union[str, Image.Image]) -> tuple:
+        start_time = time.time()
         if isinstance(image, str):
             image = Image.open(image)
         image = image.convert("RGB")
         inputs = self.blip_processor(images=image, return_tensors="pt").to(self.device)
         with torch.no_grad():
-            output_ids = self.blip_model.generate(**inputs, max_length=200, num_beams=4)
+            output_ids = self.blip_model.generate(**inputs, max_length=50, num_beams=2, early_stopping=True)
         english_caption = self.blip_processor.decode(output_ids[0], skip_special_tokens=True)
-        if language == "Русский":
-            translated_inputs = self.translator_tokenizer(english_caption, return_tensors="pt", padding=True).to(self.device)
-            with torch.no_grad():
-                translated_ids = self.translator_model.generate(**translated_inputs, max_length=200, num_beams=4)
-            russian_caption = self.translator_tokenizer.decode(translated_ids[0], skip_special_tokens=True)
-            return russian_caption
-        return english_caption
+        print(f"English caption generation time: {time.time() - start_time:.2f} seconds")
+
+        start_time = time.time()
+        translated_inputs = self.translator_tokenizer(english_caption, return_tensors="pt", padding=True).to(self.device)
+        with torch.no_grad():
+            translated_ids = self.translator_model.generate(**translated_inputs, max_length=50, num_beams=2, early_stopping=True)
+        russian_caption = self.translator_tokenizer.decode(translated_ids[0], skip_special_tokens=True)
+        print(f"Russian translation time: {time.time() - start_time:.2f} seconds")
+
+        # Release memory
+        gc.collect()
+        return english_caption, russian_caption
 
-def app(image: Image.Image, language: str) -> tuple:
-    if image is not None:
-        pipeline = ImageCaptionPipeline()
-        caption = pipeline.generate_caption(image, language=language)
+    def generate_audio(self, text: str, language: str) -> str:
+        start_time = time.time()
         lang_code = "ru" if language == "Русский" else "en"
-        tts = gTTS(text=caption, lang=lang_code)
-        audio_path = "caption_audio.mp3"
+        tts = gTTS(text=text, lang=lang_code)
+        audio_path = f"caption_audio_{uuid.uuid4()}.mp3"
         tts.save(audio_path)
-        return caption, audio_path
-    return "Загрузите изображение и выберите язык для получения подписи.", None
+        print(f"TTS generation time: {time.time() - start_time:.2f} seconds")
+        return audio_path
+
+def generate_captions(image: Image.Image) -> tuple:
+    if image is not None:
+        pipeline = init_pipeline()
+        english_caption, russian_caption = pipeline.generate_captions(image)
+        return f"English: {english_caption}", f"Русский: {russian_caption}", None
+    return "Загрузите изображение.", "Загрузите изображение.", None
+
+def generate_audio(english_caption: str, russian_caption: str, audio_language: str) -> str:
+    if not english_caption and not russian_caption:
+        return None
+    pipeline = init_pipeline()
+    text = russian_caption.replace("Русский: ", "") if audio_language == "Русский" else english_caption.replace("English: ", "")
+    return pipeline.generate_audio(text, audio_language)
 
 with gr.Blocks() as iface:
     gr.Markdown("# Генератор подписей")
-    gr.Markdown("Загрузите изображение и выберите язык.")
-    language = gr.Dropdown(choices=["Русский", "English"], label="Язык", value="Русский")
+    gr.Markdown("Загрузите изображение для получения подписей на двух языках.")
+
     image = gr.Image(type="pil", label="Изображение", height=400, width=400)
-    submit_button = gr.Button("Сгенерировать", elem_classes="btn")
-    caption_output = gr.Textbox(label="Подпись")
+    submit_button = gr.Button("Сгенерировать подписи", elem_classes="btn")
+
+    with gr.Row():
+        english_caption = gr.Textbox(label="Подпись (English)")
+        russian_caption = gr.Textbox(label="Подпись (Русский)")
+
+    with gr.Row():
+        audio_language = gr.Dropdown(choices=["Русский", "English"], label="Язык озвучки", value="Русский")
+        audio_button = gr.Button("Сгенерировать озвучку", elem_classes="btn")
+
     audio_output = gr.Audio(label="Озвучка")
-
+
    submit_button.click(
-        fn=app,
-        inputs=[image, language],
-        outputs=[caption_output, audio_output]
+        fn=generate_captions,
+        inputs=[image],
+        outputs=[english_caption, russian_caption, audio_output]
+    )
+
+    audio_button.click(
+        fn=generate_audio,
+        inputs=[english_caption, russian_caption, audio_language],
+        outputs=[audio_output]
     )
 
 if __name__ == "__main__":
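
For reference, a minimal sketch of how the refactored two-step flow could be exercised outside the Gradio UI. This is not part of the commit: it assumes the updated app.py is importable from the working directory, and the test image path example.jpg is a placeholder.

    # Minimal sketch, assuming the updated app.py is on the import path
    # and a test image exists at the placeholder path "example.jpg".
    from PIL import Image
    import app

    img = Image.open("example.jpg")  # hypothetical test image

    # Step 1 (the "Сгенерировать подписи" button): both captions in one pass.
    # The first call loads BLIP and the translator via init_pipeline().
    english, russian, _ = app.generate_captions(img)
    print(english)  # e.g. "English: a dog sitting on a bench"
    print(russian)

    # Step 2 (the "Сгенерировать озвучку" button): TTS for the chosen language.
    audio_path = app.generate_audio(english, russian, "Русский")
    print(audio_path)  # caption_audio_<uuid>.mp3

The first call to either module-level function builds the models through init_pipeline(); subsequent calls reuse the cached instance, which is the point of the new _pipeline global.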