# app.py
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import T5Tokenizer, AutoModelForSeq2SeqLM
from deep_translator import GoogleTranslator
from gtts import gTTS
import gradio as gr
from PIL import Image
import torch

# πŸ“Œ Device configuration
device = "cuda" if torch.cuda.is_available() else "cpu"

# πŸ” Load BLIP-1 model
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

# πŸ“ Load grammar polishing model (T5 paraphraser)
grammar_tokenizer = T5Tokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws")
grammar_model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws").to(device)
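
# Note: T5Tokenizer is SentencePiece-based, so the `sentencepiece` package
# must be installed (e.g. listed in requirements.txt) for this load to succeed.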

# πŸ“· Generate image caption
def generate_caption(image):
    inputs = processor(images=image, return_tensors="pt").to(device)
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)
    # Drop words already seen earlier in the caption (case-insensitive) --
    # a crude guard against BLIP's occasional repetition. Note it also
    # collapses legitimate repeats ("a man and a woman" -> "a man and woman").
    seen = set()
    cleaned = []
    for word in caption.split():
        lw = word.lower()
        if lw not in seen:
            cleaned.append(word)
            seen.add(lw)
    return " ".join(cleaned)

# ✨ Polish grammar via the T5 paraphrase model
def polish_grammar(text):
    input_text = f"paraphrase: {text} </s>"
    input_ids = grammar_tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
    outputs = grammar_model.generate(
        input_ids,
        max_length=64,
        num_beams=5,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        early_stopping=True
    )
    return grammar_tokenizer.decode(outputs[0], skip_special_tokens=True)
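
# The "paraphrase: ... </s>" prompt format above follows the usage shown on
# the Vamsi/T5_Paraphrase_Paws model card.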

# 🌐 Translate caption using Google Translate (via deep_translator)
def translate_caption_google(text, target_lang="te"):
    return GoogleTranslator(source="auto", target=target_lang).translate(text)
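
# Note: GoogleTranslator calls Google's public web translation endpoint, so
# this step needs outbound network access and may be rate-limited under load.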

# πŸ”Š Generate voice narration with gTTS
def generate_tts(text, lang="en", filename="caption.mp3"):
    tts = gTTS(text=text, lang=lang)
    path = f"/tmp/{filename}"
    tts.save(path)
    return path
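
# Note: gTTS also calls a Google endpoint over the network. Saving under /tmp
# works on Hugging Face Spaces, where that directory is writable.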

# 🎯 Final function wired to the Gradio interface
def caption_image(image, language):
    raw = generate_caption(image)
    polished = polish_grammar(raw)
    lang_codes = {
        "Telugu": "te",
        "Hindi": "hi",
        "French": "fr",
        "Spanish": "es"
    }
    translated = translate_caption_google(polished, target_lang=lang_codes[language])
    # Narration is intentionally in English, using the polished caption
    audio_path = generate_tts(polished, lang="en")
    return raw, polished, translated, audio_path

# πŸš€ Gradio Interface
demo = gr.Interface(
    fn=caption_image,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        # A default value avoids a KeyError when the user never opens the dropdown
        gr.Dropdown(["Telugu", "Hindi", "French", "Spanish"], value="Telugu", label="Translation Language")
    ],
    outputs=[
        gr.Text(label="Raw Caption"),
        gr.Text(label="Polished Caption"),
        gr.Text(label="Translated Caption"),
        gr.Audio(label="Voice Narration (TTS)")
    ],
    title="πŸ–ΌοΈ AI Image Captioning with Voice Narration",
    description="Upload an image to get a caption, grammar polishing, translation, and English voice narration."
)
# πŸ” Run the app
if __name__ == "__main__":
demo.launch(share=True, show_api=True)
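
# A minimal requirements.txt for this Space might look like the following;
# the package list mirrors the imports above (versions left unpinned):
#
#   transformers
#   torch
#   sentencepiece
#   deep-translator
#   gTTS
#   gradio
#   Pillow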