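"""YouTube Video Summarizer (Gradio app).

Extracts a video ID from a YouTube URL, fetches English captions via
youtube_transcript_api, and summarizes them with facebook/bart-large-cnn
using sentence-aligned chunking.
"""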
import re
import torch
import gradio as gr
from transformers import BartForConditionalGeneration, BartTokenizer
from youtube_transcript_api import YouTubeTranscriptApi
from nltk.tokenize import sent_tokenize
import nltk
# Download NLTK data for sentence tokenization
# ('punkt_tab' is needed by newer NLTK releases, 'punkt' by older ones)
nltk.download('punkt')
nltk.download('punkt_tab')
# Function to extract the YouTube video ID from common URL formats
def extract_video_id(url):
    patterns = [
        r'(?:v=|\/)([0-9A-Za-z_-]{11}).*',     # Standard and shortened URLs
        r'(?:embed\/)([0-9A-Za-z_-]{11})',     # Embed URLs
        r'(?:youtu\.be\/)([0-9A-Za-z_-]{11})'  # Shortened URLs
    ]
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None
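# Example (illustrative): extract_video_id("https://youtu.be/dQw4w9WgXcQ")
# returns "dQw4w9WgXcQ"; an unrecognized URL returns None.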
# Function to clean up caption text
def clean_text(text):
    cleaned_text = re.sub(r'\s+', ' ', text)      # collapse runs of whitespace
    cleaned_text = cleaned_text.replace("'", "")  # drop apostrophes
    return cleaned_text
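# Example (illustrative): clean_text("it's  a\n test") returns "its a test".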
# Fetch captions from a YouTube video, translating to English if needed
def get_youtube_captions(video_id):
    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        full_transcript = ""
        # Use the first available transcript that can be translated to English
        for transcript in transcript_list:
            try:
                english_transcript = transcript.translate('en').fetch()
                for caption in english_transcript:
                    full_transcript += caption['text'] + " "
                break
            except Exception:
                continue
        return clean_text(full_transcript)
    except Exception as e:
        print(f"Error fetching captions: {e}")
        return None
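# Example (illustrative): get_youtube_captions("dQw4w9WgXcQ") returns the
# cleaned English transcript as one string, or a falsy value (None or "")
# when no usable transcript exists.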
# Summarize large text with BART model
def summarize_large_text_with_bart(input_text):
    model_name = "facebook/bart-large-cnn"
    model = BartForConditionalGeneration.from_pretrained(model_name)
    tokenizer = BartTokenizer.from_pretrained(model_name)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Size the summary at roughly 28-40% of the input's token count
    input_tokens = tokenizer.encode(input_text, add_special_tokens=False)
    total_input_length = len(input_tokens)
    desired_min_length = int(total_input_length * 0.28)
    desired_max_length = int(total_input_length * 0.40)

    # Split the input into sentence-aligned chunks that fit BART's
    # 1024-token window, overlapping a few sentences between chunks
    # so context carries across boundaries
    sentences = sent_tokenize(input_text)
    max_chunk_length = 1024
    overlap = 2

    chunks = []
    sentence_tokens = [tokenizer.encode(sentence, add_special_tokens=False) for sentence in sentences]
    sentence_lengths = [len(tokens) for tokens in sentence_tokens]

    i = 0
    while i < len(sentences):
        current_chunk = []
        current_length = 0
        start = i
        while i < len(sentences) and current_length + sentence_lengths[i] <= max_chunk_length:
            current_chunk.append(sentences[i])
            current_length += sentence_lengths[i]
            i += 1
        # Guard: a single sentence longer than max_chunk_length would stall
        # the loop; give it its own chunk (the tokenizer truncates it below)
        if not current_chunk:
            current_chunk.append(sentences[i])
            i += 1
        # Step back by `overlap` sentences, but always advance past `start`
        # so the loop is guaranteed to terminate
        if i < len(sentences):
            i = max(i - overlap, start + 1)
        chunks.append(' '.join(current_chunk))

    # Summarize each chunk, splitting the overall length budget evenly;
    # the floors guard against degenerate lengths on very short inputs
    summaries = []
    for chunk in chunks:
        inputs = tokenizer.encode(chunk, return_tensors='pt', max_length=1024, truncation=True).to(device)
        with torch.no_grad():
            summary_ids = model.generate(
                inputs,
                max_length=max(desired_max_length // len(chunks), 40),
                min_length=max(desired_min_length // len(chunks), 10),
                num_beams=4,
                length_penalty=2.0,
                no_repeat_ngram_size=3,
                early_stopping=True
            )
        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

    return ' '.join(summaries)
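# Note (illustrative): with max_chunk_length=1024 and overlap=2, a transcript
# of ~3000 tokens typically yields three to four overlapping chunks, and the
# 28-40% length budget is divided evenly among them.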
# Gradio Interface
def summarize_video_gradio(video_url):
    try:
        # Extract video ID from URL
        video_id = extract_video_id(video_url)
        if not video_id:
            return "Invalid YouTube URL. Please check the URL and try again."

        # Get captions for the video
        captions = get_youtube_captions(video_id)
        if not captions:
            return "Unable to fetch video captions. Ensure the video has subtitles available."

        # Generate summary
        summary = summarize_large_text_with_bart(captions)
        return f"**Video ID:** {video_id}\n\n**Summary:**\n{summary}"
    except Exception as e:
        return f"An error occurred: {str(e)}"
# Gradio Interface Setup
with gr.Blocks() as interface:
    gr.Markdown("## YouTube Video Summarizer")
    gr.Markdown("Paste a YouTube video URL below to fetch captions and summarize the content.")

    video_url_input = gr.Textbox(label="YouTube Video URL", placeholder="Enter YouTube URL here...")
    summarize_button = gr.Button("Summarize")
    output = gr.Textbox(label="Output", lines=10)

    summarize_button.click(summarize_video_gradio, inputs=video_url_input, outputs=output)
# Launch the Gradio app
if __name__ == "__main__":
    interface.launch()
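# Note: on Hugging Face Spaces the app is served automatically; when running
# locally, interface.launch(share=True) creates a temporary public link.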