Spaces:

maliahson
/

Youtube_Video_Summerizer

Sleeping

App Files Community

maliahson committed on Dec 9, 2024

Commit

5bd0b3b

verified ·

1 Parent(s): ba5072f

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -115

app.py CHANGED Viewed

@@ -1,16 +1,9 @@
-import re
-import torch
 import gradio as gr
-from transformers import BartForConditionalGeneration, BartTokenizer
-from youtube_transcript_api import YouTubeTranscriptApi
-from nltk.tokenize import sent_tokenize
-import nltk
-# Download NLTK data for tokenization
-nltk.download('punkt')
-# Function to extract YouTube video ID
 def extract_video_id(url):
     patterns = [
         r'(?:v=|\/)([0-9A-Za-z_-]{11}).*',  # Standard and shortened URLs
         r'(?:embed\/)([0-9A-Za-z_-]{11})',   # Embed URLs
@@ -23,123 +16,56 @@ def extract_video_id(url):
             return match.group(1)
     return None
-# Function to clean up text
-def clean_text(text):
-    cleaned_text = re.sub(r'\s+', ' ', text)
-    cleaned_text = cleaned_text.replace("'", "")
-    return cleaned_text
-# Fetch captions from YouTube video
-def get_youtube_captions(video_id):
     try:
-        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
-        full_transcript = ""
-        for transcript in transcript_list:
-            try:
-                english_transcript = transcript.translate('en').fetch()
-                for caption in english_transcript:
-                    full_transcript += caption['text'] + " "
-                break
-            except Exception:
-                continue
-        return clean_text(full_transcript)
-    except Exception as e:
-        print(f"Error fetching captions: {e}")
-        return None
-# Summarize large text with BART model
-def summarize_large_text_with_bart(input_text):
-    model_name = "facebook/bart-large-cnn"
-    model = BartForConditionalGeneration.from_pretrained(model_name)
-    tokenizer = BartTokenizer.from_pretrained(model_name)
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-    model.to(device)
-    input_tokens = tokenizer.encode(input_text, add_special_tokens=False)
-    total_input_length = len(input_tokens)
-    desired_min_length = int(total_input_length * 0.28)
-    desired_max_length = int(total_input_length * 0.40)
-    sentences = sent_tokenize(input_text)
-    max_chunk_length = 1024
-    overlap = 2
-    chunks = []
-    sentence_tokens = [tokenizer.encode(sentence, add_special_tokens=False) for sentence in sentences]
-    sentence_lengths = [len(tokens) for tokens in sentence_tokens]
-    i = 0
-    while i < len(sentences):
-        current_chunk = []
-        current_length = 0
-        start = i
-        while i < len(sentences) and current_length + sentence_lengths[i] <= max_chunk_length:
-            current_chunk.append(sentences[i])
-            current_length += sentence_lengths[i]
-            i += 1
-        if i < len(sentences):
-            i = i - overlap if i - overlap > start else start
-        chunks.append(' '.join(current_chunk))
-    summaries = []
-    for chunk in chunks:
-        inputs = tokenizer.encode(chunk, return_tensors='pt', max_length=1024, truncation=True).to(device)
-        with torch.no_grad():
-            summary_ids = model.generate(
-                inputs,
-                max_length=desired_max_length // len(chunks),
-                min_length=desired_min_length // len(chunks),
-                num_beams=4,
-                length_penalty=2.0,
-                no_repeat_ngram_size=3,
-                early_stopping=True
-            )
-        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
-    return ' '.join(summaries)
-# Gradio Interface
-def summarize_video_gradio(video_url):
-    try:
-        # Extract video ID from URL
         video_id = extract_video_id(video_url)
         if not video_id:
-            return "Invalid YouTube URL. Please check the URL and try again."
-        # Get captions for the video
         captions = get_youtube_captions(video_id)
         if not captions:
-            return "Unable to fetch video captions. Ensure the video has subtitles available."
         # Generate summary
         summary = summarize_large_text_with_bart(captions)
-        return f"**Video ID:** {video_id}\n\n**Summary:**\n{summary}"
     except Exception as e:
-        return f"An error occurred: {str(e)}"
-# Gradio Interface Setup
-with gr.Blocks() as interface:
-    gr.Markdown("## YouTube Video Summarizer")
-    gr.Markdown("Paste a YouTube video URL below to fetch captions and summarize the content.")
-    video_url_input = gr.Textbox(label="YouTube Video URL", placeholder="Enter YouTube URL here...")
-    summarize_button = gr.Button("Summarize")
-    output = gr.Textbox(label="Output", lines=10)
-    summarize_button.click(summarize_video_gradio, inputs=video_url_input, outputs=output)
-# Launch the Gradio app
-if __name__ == "__main__":
-    interface.launch()

 import gradio as gr
+import re
+from youtube_utils import get_youtube_captions, summarize_large_text_with_bart
 def extract_video_id(url):
+    # Handle different YouTube URL formats
     patterns = [
         r'(?:v=|\/)([0-9A-Za-z_-]{11}).*',  # Standard and shortened URLs
         r'(?:embed\/)([0-9A-Za-z_-]{11})',   # Embed URLs
             return match.group(1)
     return None
+def summarize_video(video_url):
     try:
+        # Extract video ID
         video_id = extract_video_id(video_url)
         if not video_id:
+            return "Error: Invalid YouTube URL.", None
+        # Get captions
         captions = get_youtube_captions(video_id)
         if not captions:
+            return "Error: Unable to fetch video captions.", None
         # Generate summary
         summary = summarize_large_text_with_bart(captions)
+        # Create embed URL for video preview
+        video_embed_url = f"https://www.youtube.com/embed/{video_id}"
+        return summary, video_embed_url
     except Exception as e:
+        return f"Error: {str(e)}", None
+# Gradio interface
+with gr.Blocks() as demo:
+    with gr.Row():
+        gr.Markdown("# YouTube Video Summarizer")
+    with gr.Row():
+        video_url_input = gr.Textbox(label="YouTube Video URL", placeholder="Enter YouTube video URL")
+    with gr.Row():
+        summarize_button = gr.Button("Summarize Video")
+    with gr.Row():
+        summary_output = gr.Textbox(label="Summary", interactive=False, lines=10)
+    with gr.Row():
+        video_preview = gr.HTML(label="Video Preview")
+    def handle_button_click(video_url):
+        summary, video_embed_url = summarize_video(video_url)
+        video_html = f'<iframe width="560" height="315" src="{video_embed_url}" frameborder="0" allowfullscreen></iframe>' if video_embed_url else ""
+        return summary, video_html
+    summarize_button.click(
+        fn=handle_button_click,
+        inputs=[video_url_input],
+        outputs=[summary_output, video_preview]
+    )
+# Launch the app
+demo.launch()