Spaces:

Sneha-Kaurav
/

Youtube_subs_summarizer

Sleeping

App Files Files Community

Sneha-Kaurav commited on 30 days ago

Commit

baee6a3

verified ·

1 Parent(s): 89a05fa

Update app.py

Browse files

Files changed (1) hide show

app.py +79 -79

app.py CHANGED Viewed

@@ -1,79 +1,79 @@
-import re
-import torch
-from transformers import T5ForConditionalGeneration, T5Tokenizer
-from youtube_transcript_api import YouTubeTranscriptApi
-from youtube_transcript_api.formatters import TextFormatter
-import gradio as gr
-# Load the T5 model and tokenizer
-model_name = "bilal521/t5-youtube-summarizer"
-model = T5ForConditionalGeneration.from_pretrained(model_name)
-tokenizer = T5Tokenizer.from_pretrained(model_name)
-# Clean and summarize text
-def summarize_with_t5(text):
-    input_text = "summarize: " + text.strip()
-    inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
-    summary_ids = model.generate(
-        inputs,
-        max_length=256,
-        min_length=80,
-        num_beams=5,
-        length_penalty=2.0,
-        no_repeat_ngram_size=3,
-        early_stopping=True
-    )
-    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-# Extract video ID from any YouTube URL
-def extract_video_id(url):
-    regex = r"(?:youtube\.com\/(?:[^\/\n\s]+\/\S+\/|(?:v|e(?:mbed)?|shorts)\/|\S*?[?&]v=)|youtu\.be\/)([a-zA-Z0-9_-]{11})"
-    match = re.search(regex, url)
-    return match.group(1) if match else None
-# Optional: Clean up repeated or spammy lines
-def clean_transcript(text):
-    lines = text.split("\n")
-    seen = set()
-    clean_lines = []
-    for line in lines:
-        line = line.strip()
-        if not line or line.lower() in seen:
-            continue
-        if re.match(r'https?:\/\/', line):
-            continue
-        seen.add(line.lower())
-        clean_lines.append(line)
-    return " ".join(clean_lines)
-# Main logic to fetch transcript and summarize
-def get_youtube_transcript(video_url):
-    video_id = extract_video_id(video_url)
-    if not video_id:
-        return "Could not extract video ID. Please check the URL."
-    try:
-        yt = YouTubeTranscriptApi()
-        transcript = yt.fetch(video_id, languages=['en'])
-        formatter = TextFormatter()
-        raw_text = formatter.format_transcript(transcript)
-        cleaned_text = clean_transcript(raw_text)
-        summary = summarize_with_t5(cleaned_text)
-        return summary
-    except Exception as e:
-        return f"Error occurred: {e}"
-# Gradio UI
-demo = gr.Interface(
-    fn=get_youtube_transcript,
-    inputs=[gr.Textbox(label="YouTube Video URL", lines=1, placeholder="Paste your YouTube URL here")],
-    outputs=[gr.Textbox(label="Summarized Transcript", lines=10)],
-    title="YouTube Video Summarizer",
-    description="This app extracts and summarizes the transcript of a YouTube video using a fine-tuned T5 model."
-)
-demo.launch()

+import re
+import torch
+from transformers import T5ForConditionalGeneration, T5Tokenizer
+from youtube_transcript_api import YouTubeTranscriptApi
+from youtube_transcript_api.formatters import TextFormatter
+import gradio as gr
+# Load the T5 model and tokenizer
+model_name = "bilal521/t5-youtube-summarizer"
+model = T5ForConditionalGeneration.from_pretrained(model_name)
+tokenizer = T5Tokenizer.from_pretrained(model_name)
+# Clean and summarize text
+def summarize_with_t5(text):
+    input_text = "summarize: " + text.strip()
+    inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
+    summary_ids = model.generate(
+        inputs,
+        max_length=256,
+        min_length=80,
+        num_beams=5,
+        length_penalty=2.0,
+        no_repeat_ngram_size=3,
+        early_stopping=True
+    )
+    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+# Extract video ID from any YouTube URL
+def extract_video_id(url):
+    regex = r"(?:youtube\.com\/(?:[^\/\n\s]+\/\S+\/|(?:v|e(?:mbed)?|shorts)\/|\S*?[?&]v=)|youtu\.be\/)([a-zA-Z0-9_-]{11})"
+    match = re.search(regex, url)
+    return match.group(1) if match else None
+# Optional: Clean up repeated or spammy lines
+def clean_transcript(text):
+    lines = text.split("\n")
+    seen = set()
+    clean_lines = []
+    for line in lines:
+        line = line.strip()
+        if not line or line.lower() in seen:
+            continue
+        if re.match(r'https?:\/\/', line):
+            continue
+        seen.add(line.lower())
+        clean_lines.append(line)
+    return " ".join(clean_lines)
+# Main logic to fetch transcript and summarize
+def get_youtube_transcript(video_url):
+    video_id = extract_video_id(video_url)
+    if not video_id:
+        return "Could not extract video ID. Please check the URL."
+    try:
+        yt = YouTubeTranscriptApi()
+        transcript = yt.fetch(video_id, languages=['en'])
+        formatter = TextFormatter()
+        raw_text = formatter.format_transcript(transcript)
+        cleaned_text = clean_transcript(raw_text)
+        summary = summarize_with_t5(cleaned_text)
+        return summary
+    except Exception as e:
+        return f"Error occurred: {e}"
+# Gradio UI
+demo = gr.Interface(
+    fn=get_youtube_transcript,
+    inputs=[gr.Textbox(label="YouTube Video URL", lines=1, placeholder="Paste your YouTube URL here")],
+    outputs=[gr.Textbox(label="Summarized Transcript", lines=10)],
+    title="YouTube Video Summarizer",
+    description="This app extracts and summarizes the transcript of a YouTube video using a fine-tuned T5 model."
+)
+demo.launch()