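"""YouTube Video Summarizer (Gradio app).

Extracts a video ID from a YouTube URL, fetches English captions via
youtube_transcript_api, and summarizes them with facebook/bart-large-cnn
using sentence-aligned chunking.
"""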
import re
import torch
import gradio as gr
from transformers import BartForConditionalGeneration, BartTokenizer
from youtube_transcript_api import YouTubeTranscriptApi
from nltk.tokenize import sent_tokenize
import nltk
# Download NLTK data for sentence tokenization
# ('punkt_tab' is needed by newer NLTK releases, 'punkt' by older ones)
nltk.download('punkt')
nltk.download('punkt_tab')
# Function to extract the YouTube video ID from common URL formats
def extract_video_id(url):
    patterns = [
        r'(?:v=|\/)([0-9A-Za-z_-]{11}).*',     # Standard and shortened URLs
        r'(?:embed\/)([0-9A-Za-z_-]{11})',     # Embed URLs
        r'(?:youtu\.be\/)([0-9A-Za-z_-]{11})'  # Shortened URLs
    ]
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None
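# Example (illustrative): extract_video_id("https://youtu.be/dQw4w9WgXcQ")
# returns "dQw4w9WgXcQ"; an unrecognized URL returns None.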
# Function to clean up caption text
def clean_text(text):
    cleaned_text = re.sub(r'\s+', ' ', text)      # collapse runs of whitespace
    cleaned_text = cleaned_text.replace("'", "")  # drop apostrophes
    return cleaned_text
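# Example (illustrative): clean_text("it's  a\n test") returns "its a test".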
# Fetch captions from a YouTube video, translating to English if needed
def get_youtube_captions(video_id):
    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        full_transcript = ""
        # Use the first available transcript that can be translated to English
        for transcript in transcript_list:
            try:
                english_transcript = transcript.translate('en').fetch()
                for caption in english_transcript:
                    full_transcript += caption['text'] + " "
                break
            except Exception:
                continue
        return clean_text(full_transcript)
    except Exception as e:
        print(f"Error fetching captions: {e}")
        return None
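# Example (illustrative): get_youtube_captions("dQw4w9WgXcQ") returns the
# cleaned English transcript as one string, or a falsy value (None or "")
# when no usable transcript exists.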
# Summarize large text with BART model
def summarize_large_text_with_bart(input_text):
    model_name = "facebook/bart-large-cnn"
    model = BartForConditionalGeneration.from_pretrained(model_name)
    tokenizer = BartTokenizer.from_pretrained(model_name)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Size the summary at roughly 28-40% of the input's token count
    input_tokens = tokenizer.encode(input_text, add_special_tokens=False)
    total_input_length = len(input_tokens)
    desired_min_length = int(total_input_length * 0.28)
    desired_max_length = int(total_input_length * 0.40)

    # Split the input into sentence-aligned chunks that fit BART's
    # 1024-token window, overlapping a few sentences between chunks
    # so context carries across boundaries
    sentences = sent_tokenize(input_text)
    max_chunk_length = 1024
    overlap = 2

    chunks = []
    sentence_tokens = [tokenizer.encode(sentence, add_special_tokens=False) for sentence in sentences]
    sentence_lengths = [len(tokens) for tokens in sentence_tokens]

    i = 0
    while i < len(sentences):
        current_chunk = []
        current_length = 0
        start = i
        while i < len(sentences) and current_length + sentence_lengths[i] <= max_chunk_length:
            current_chunk.append(sentences[i])
            current_length += sentence_lengths[i]
            i += 1
        # Guard: a single sentence longer than max_chunk_length would stall
        # the loop; give it its own chunk (the tokenizer truncates it below)
        if not current_chunk:
            current_chunk.append(sentences[i])
            i += 1
        # Step back by `overlap` sentences, but always advance past `start`
        # so the loop is guaranteed to terminate
        if i < len(sentences):
            i = max(i - overlap, start + 1)
        chunks.append(' '.join(current_chunk))

    # Summarize each chunk, splitting the overall length budget evenly;
    # the floors guard against degenerate lengths on very short inputs
    summaries = []
    for chunk in chunks:
        inputs = tokenizer.encode(chunk, return_tensors='pt', max_length=1024, truncation=True).to(device)
        with torch.no_grad():
            summary_ids = model.generate(
                inputs,
                max_length=max(desired_max_length // len(chunks), 40),
                min_length=max(desired_min_length // len(chunks), 10),
                num_beams=4,
                length_penalty=2.0,
                no_repeat_ngram_size=3,
                early_stopping=True
            )
        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

    return ' '.join(summaries)
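# Note (illustrative): with max_chunk_length=1024 and overlap=2, a transcript
# of ~3000 tokens typically yields three to four overlapping chunks, and the
# 28-40% length budget is divided evenly among them.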
# Gradio Interface
def summarize_video_gradio(video_url):
    try:
        # Extract video ID from URL
        video_id = extract_video_id(video_url)
        if not video_id:
            return "Invalid YouTube URL. Please check the URL and try again."

        # Get captions for the video
        captions = get_youtube_captions(video_id)
        if not captions:
            return "Unable to fetch video captions. Ensure the video has subtitles available."

        # Generate summary
        summary = summarize_large_text_with_bart(captions)
        return f"**Video ID:** {video_id}\n\n**Summary:**\n{summary}"
    except Exception as e:
        return f"An error occurred: {str(e)}"
# Gradio Interface Setup
with gr.Blocks() as interface:
    gr.Markdown("## YouTube Video Summarizer")
    gr.Markdown("Paste a YouTube video URL below to fetch captions and summarize the content.")

    video_url_input = gr.Textbox(label="YouTube Video URL", placeholder="Enter YouTube URL here...")
    summarize_button = gr.Button("Summarize")
    output = gr.Textbox(label="Output", lines=10)

    summarize_button.click(summarize_video_gradio, inputs=video_url_input, outputs=output)
# Launch the Gradio app
if __name__ == "__main__":
    interface.launch()
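# Note: on Hugging Face Spaces the app is served automatically; when running
# locally, interface.launch(share=True) creates a temporary public link.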