# app.py
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import T5Tokenizer, AutoModelForSeq2SeqLM
from deep_translator import GoogleTranslator
from gtts import gTTS
import gradio as gr
from PIL import Image
import torch
# Device configuration
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the BLIP-1 image-captioning model and move it to the selected device
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)
# Load the grammar-polishing (paraphrasing) model
grammar_tokenizer = T5Tokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws")
grammar_model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws").to(device)
# Generate a raw caption for the input image
def generate_caption(image):
    inputs = processor(images=image, return_tensors="pt").to(device)
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)
    # Drop repeated words (a common BLIP artifact), keeping first occurrences.
    # Note: this also removes legitimate repeats such as a second "a" or "the".
    seen = set()
    cleaned = []
    for word in caption.split():
        lw = word.lower()
        if lw not in seen:
            cleaned.append(word)
            seen.add(lw)
    return " ".join(cleaned)
# Polish grammar by paraphrasing the raw caption
def polish_grammar(text):
    input_text = f"paraphrase: {text} </s>"
    input_ids = grammar_tokenizer.encode(
        input_text, return_tensors="pt", max_length=512, truncation=True
    ).to(device)
    outputs = grammar_model.generate(
        input_ids,
        max_length=64,
        num_beams=5,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )
    return grammar_tokenizer.decode(outputs[0], skip_special_tokens=True)
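
# Example (the output below is illustrative only; beam-search paraphrases can
# vary across model and library versions):
#
#   polish_grammar("a dog sitting on grass")  # -> e.g. "A dog is sitting on the grass."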
# Translate the caption with Google Translate (via deep_translator)
def translate_caption_google(text, target_lang="te"):
    return GoogleTranslator(source="auto", target=target_lang).translate(text)
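
# Example (requires internet access, since deep_translator calls the public
# Google Translate endpoint; target codes are ISO 639-1):
#
#   translate_caption_google("a dog sitting on the grass", target_lang="hi")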
# Generate a voice narration with gTTS
def generate_tts(text, lang="en", filename="caption.mp3"):
    tts = gTTS(text=text, lang=lang)
    path = f"/tmp/{filename}"
    tts.save(path)
    return path
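
# Note: gTTS also needs internet access (it calls Google's TTS endpoint), and
# /tmp is writable on Hugging Face Spaces, so the saved MP3 path can be handed
# straight to the gr.Audio output component.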
# Pipeline function wired to the Gradio interface
def caption_image(image, language):
    raw = generate_caption(image)
    polished = polish_grammar(raw)
    lang_codes = {
        "Telugu": "te",
        "Hindi": "hi",
        "French": "fr",
        "Spanish": "es",
    }
    translated = translate_caption_google(polished, target_lang=lang_codes[language])
    audio_path = generate_tts(polished, lang="en")  # narration stays in English
    return raw, polished, translated, audio_path
# Gradio interface
demo = gr.Interface(
    fn=caption_image,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Dropdown(["Telugu", "Hindi", "French", "Spanish"], label="Translation Language"),
    ],
    outputs=[
        gr.Text(label="Raw Caption"),
        gr.Text(label="Polished Caption"),
        gr.Text(label="Translated Caption"),
        gr.Audio(label="Voice Narration (TTS)"),
    ],
    title="🖼️ AI Image Captioning with Voice Narration",
    description="Upload an image to get a caption, grammar polishing, translation, and English voice narration.",
)
# Run the app
if __name__ == "__main__":
    demo.launch(share=True, show_api=True)
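
# A minimal requirements.txt for this Space might look like the following
# (package list inferred from the imports above; no versions are pinned here,
# and any pins you add would be assumptions):
#
#   transformers
#   torch
#   sentencepiece    # required by T5Tokenizer
#   deep-translator
#   gTTS
#   gradio
#   Pillow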