Update app.py
app.py
CHANGED
@@ -4,8 +4,12 @@ from PIL import Image
 from transformers import (
     BlipProcessor,
     BlipForConditionalGeneration,
+    Blip2Processor,
+    Blip2ForConditionalGeneration,
     M2M100Tokenizer,
-    M2M100ForConditionalGeneration
+    M2M100ForConditionalGeneration,
+    AutoTokenizer,
+    AutoModelForSeq2SeqLM
 )
 from typing import Union
 from gtts import gTTS
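The new imports add a second captioning model (BLIP-2) and a second translator (the MarianMT checkpoint Helsinki-NLP/opus-mt-en-ru, loaded through the Auto classes). A minimal sketch, not part of the commit, of exercising the Helsinki path on its own with the same generation settings the diff uses:

    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

    tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ru")
    model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ru")

    # Same decoding parameters as translate_caption below.
    batch = tokenizer("A dog is running on the beach.", return_tensors="pt")
    ids = model.generate(**batch, max_length=50, num_beams=2, early_stopping=True)
    print(tokenizer.decode(ids[0], skip_special_tokens=True))  # prints the Russian translation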
@@ -14,60 +18,79 @@ import uuid
 import time
 import gc

-# Оптимизация CPU: установка числа потоков
 torch.set_num_threads(2)
-
-# Глобальная переменная для кэширования pipeline
 _pipeline = None

-def init_pipeline():
+def init_pipeline(caption_model: str, translator_model: str):
     global _pipeline
     if _pipeline is None:
-        _pipeline = ImageCaptionPipeline()
+        _pipeline = ImageCaptionPipeline(caption_model, translator_model)
     return _pipeline

 class ImageCaptionPipeline:
-    def __init__(self):
+    def __init__(self, caption_model: str, translator_model: str):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.caption_model = caption_model
+        self.translator_model = translator_model
+
         start_time = time.time()
-        …
+        if caption_model == "BLIP":
+            self.blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large", use_fast=True)
+            self.blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(self.device)
+        else:
+            self.blip_processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
+            self.blip_model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b").to(self.device)
+        print(f"Время загрузки {caption_model}: {time.time() - start_time:.2f} секунд")

         start_time = time.time()
-        …
+        if translator_model == "M2M100":
+            self.translator_tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
+            self.translator_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M").to(self.device)
+        else:
+            self.translator_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ru")
+            self.translator_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ru").to(self.device)
+        print(f"Время загрузки переводчика {translator_model}: {time.time() - start_time:.2f} секунд")

-    def …
+    def generate_english_caption(self, image: Union[str, Image.Image]) -> str:
         start_time = time.time()
         if isinstance(image, str):
             image = Image.open(image)
         image = image.convert("RGB")
-        image = image.resize((384, 384))
+        image = image.resize((384, 384))
         inputs = self.blip_processor(images=image, return_tensors="pt").to(self.device)
         with torch.no_grad():
             output_ids = self.blip_model.generate(**inputs, max_length=50, num_beams=2, early_stopping=True)
         english_caption = self.blip_processor.decode(output_ids[0], skip_special_tokens=True)
+        english_caption = english_caption[0].upper() + english_caption[1:] + ('.' if not english_caption.endswith('.') else '')
         print(f"Время генерации английской подписи: {time.time() - start_time:.2f} секунд")

+        gc.collect()
+        return english_caption
+
+    def translate_caption(self, english_caption: str) -> str:
         start_time = time.time()
-        self.…
-        …
+        if self.translator_model == "M2M100":
+            self.translator_tokenizer.src_lang = "en"
+            translated_inputs = self.translator_tokenizer(english_caption, return_tensors="pt", padding=True).to(self.device)
+            with torch.no_grad():
+                translated_ids = self.translator_model.generate(
+                    **translated_inputs,
+                    forced_bos_token_id=self.translator_tokenizer.get_lang_id("ru"),
+                    max_length=50,
+                    num_beams=2,
+                    early_stopping=True
+                )
+            russian_caption = self.translator_tokenizer.decode(translated_ids[0], skip_special_tokens=True)
+        else:
+            translated_inputs = self.translator_tokenizer(english_caption, return_tensors="pt", padding=True).to(self.device)
+            with torch.no_grad():
+                translated_ids = self.translator_model.generate(**translated_inputs, max_length=50, num_beams=2, early_stopping=True)
+            russian_caption = self.translator_tokenizer.decode(translated_ids[0], skip_special_tokens=True)
+        russian_caption = russian_caption[0].upper() + russian_caption[1:] + ('.' if not russian_caption.endswith('.') else '')
         print(f"Время перевода на русский: {time.time() - start_time:.2f} секунд")

-        # Освобождение памяти
         gc.collect()
-        return …
+        return russian_caption

     def generate_audio(self, text: str, language: str) -> str:
         start_time = time.time()
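Note a bug the new __init__ introduces: self.translator_model first stores the dropdown string ("M2M100" or "Helsinki") and is then overwritten a few lines later with the loaded model object. By the time translate_caption runs, self.translator_model == "M2M100" is always False, so the M2M100 branch (with src_lang = "en" and forced_bos_token_id) is unreachable and M2M100 would be decoded without a target-language token. A self-contained illustration of the collision:

    class Demo:
        def __init__(self, translator_model: str):
            self.translator_model = translator_model  # the choice string
            self.translator_model = object()           # overwritten by the loaded model

        def uses_m2m100_branch(self) -> bool:
            return self.translator_model == "M2M100"   # compares a model object to a str

    print(Demo("M2M100").uses_m2m100_branch())  # False — the branch never runs

Keeping the string under a separate attribute (say, self.translator_name — a hypothetical rename, not in the commit) and branching on that would restore the intended behavior.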
@@ -78,42 +101,57 @@ class ImageCaptionPipeline:
         print(f"Время генерации озвучки: {time.time() - start_time:.2f} секунд")
         return audio_path

-def …
+def generate_english_caption(image: Image.Image, caption_model: str, translator_model: str) -> tuple:
     if image is not None:
-        pipeline = init_pipeline()
-        english_caption …
-        return english_caption, …
-    return "Загрузите изображение.", "…
+        pipeline = init_pipeline(caption_model, translator_model)
+        english_caption = pipeline.generate_english_caption(image)
+        return english_caption, "", None
+    return "Загрузите изображение.", "", None
+
+def generate_translation(english_caption: str, caption_model: str, translator_model: str) -> str:
+    if not english_caption or english_caption == "Загрузите изображение.":
+        return ""
+    pipeline = init_pipeline(caption_model, translator_model)
+    return pipeline.translate_caption(english_caption)

-def generate_audio(english_caption: str, russian_caption: str, audio_language: str) -> str:
+def generate_audio(english_caption: str, russian_caption: str, audio_language: str, caption_model: str, translator_model: str) -> str:
     if not english_caption and not russian_caption:
         return None
-    pipeline = init_pipeline()
+    pipeline = init_pipeline(caption_model, translator_model)
     text = russian_caption if audio_language == "Русский" else english_caption
     return pipeline.generate_audio(text, audio_language)

-with gr.Blocks(css=".btn {width: 200px; background-color: #…
+with gr.Blocks(css=".btn {width: 200px; background-color: #4B0082; color: white; border: none; padding: 10px 20px; text-align: center; font-size: 16px; margin: 10px auto; display: block;} .equal-height { height: 60px; }") as iface:
     with gr.Row():
-        with gr.Column(scale=1, min_width=…
-            image = gr.Image(type="pil", label="Изображение", height=…
+        with gr.Column(scale=1, min_width=250, variant="panel"):
+            image = gr.Image(type="pil", label="Изображение", height=250, width=250)
+            caption_model = gr.Dropdown(choices=["BLIP", "BLIP-2"], label="Модель описания", value="BLIP")
             submit_button = gr.Button("Сгенерировать описание", elem_classes="btn")
         with gr.Column(scale=1, min_width=300):
             english_caption = gr.Textbox(label="Подпись English:", lines=2)
             russian_caption = gr.Textbox(label="Подпись Русский:", lines=2)
+            translator_model = gr.Dropdown(choices=["M2M100", "Helsinki"], label="Модель перевода", value="M2M100")
+            translate_button = gr.Button("Сгенерировать перевод", elem_classes="btn")
             audio_button = gr.Button("Сгенерировать озвучку", elem_classes="btn")
     with gr.Row():
         audio_language = gr.Dropdown(choices=["Русский", "English"], label="Язык озвучки", value="Русский", scale=1, min_width=150, elem_classes="equal-height")
         audio_output = gr.Audio(label="Озвучка", scale=1, min_width=150, elem_classes="equal-height")

     submit_button.click(
-        fn=…
-        inputs=[image],
+        fn=generate_english_caption,
+        inputs=[image, caption_model, translator_model],
         outputs=[english_caption, russian_caption, audio_output]
     )

+    translate_button.click(
+        fn=generate_translation,
+        inputs=[english_caption, caption_model, translator_model],
+        outputs=[russian_caption]
+    )
+
     audio_button.click(
         fn=generate_audio,
-        inputs=[english_caption, russian_caption, audio_language],
+        inputs=[english_caption, russian_caption, audio_language, caption_model, translator_model],
         outputs=[audio_output]
     )
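Also worth noting: _pipeline is cached globally and init_pipeline only constructs it once, so the caption_model and translator_model arguments are ignored after the first click — switching a dropdown from BLIP to BLIP-2 keeps serving the previously loaded models. A minimal sketch of a cache keyed by the chosen pair (an alternative, not what this commit does; it reuses ImageCaptionPipeline from this file):

    _pipelines = {}

    def init_pipeline(caption_model: str, translator_model: str):
        # One pipeline per (captioner, translator) combination.
        key = (caption_model, translator_model)
        if key not in _pipelines:
            _pipelines[key] = ImageCaptionPipeline(caption_model, translator_model)
        return _pipelines[key]

On a small CPU Space, keeping several model pairs resident can exhaust RAM, so evicting the previously cached pipeline (and calling gc.collect()) before loading a new pair may be the safer variant.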
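Finally, a sizing caveat: Salesforce/blip2-opt-2.7b bundles a ViT, a Q-Former, and OPT-2.7B — roughly an order of magnitude more parameters than blip-image-captioning-large — which is heavy for a Space pinned to two CPU threads. A hedged sketch of loading it in bfloat16 to roughly halve resident memory (an assumption about the deployment, not something this commit does):

    import torch
    from transformers import Blip2Processor, Blip2ForConditionalGeneration

    processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
    model = Blip2ForConditionalGeneration.from_pretrained(
        "Salesforce/blip2-opt-2.7b",
        torch_dtype=torch.bfloat16,  # about half the memory of float32
    )

Whether bfloat16 inference is fast enough on this hardware would need measuring; float32 with the smaller BLIP checkpoint may remain the practical default.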