Sneha-Kaurav committed on
Commit
baee6a3
·
verified ·
1 Parent(s): 89a05fa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -79
app.py CHANGED
@@ -1,79 +1,79 @@
1
- import re
2
- import torch
3
- from transformers import T5ForConditionalGeneration, T5Tokenizer
4
- from youtube_transcript_api import YouTubeTranscriptApi
5
- from youtube_transcript_api.formatters import TextFormatter
6
- import gradio as gr
7
-
8
- # Load the T5 model and tokenizer
9
- model_name = "bilal521/t5-youtube-summarizer"
10
- model = T5ForConditionalGeneration.from_pretrained(model_name)
11
- tokenizer = T5Tokenizer.from_pretrained(model_name)
12
-
13
- # Clean and summarize text
14
- def summarize_with_t5(text):
15
- input_text = "summarize: " + text.strip()
16
- inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
17
-
18
- summary_ids = model.generate(
19
- inputs,
20
- max_length=256,
21
- min_length=80,
22
- num_beams=5,
23
- length_penalty=2.0,
24
- no_repeat_ngram_size=3,
25
- early_stopping=True
26
- )
27
-
28
- return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
29
-
30
- # Extract video ID from any YouTube URL
31
- def extract_video_id(url):
32
- regex = r"(?:youtube\.com\/(?:[^\/\n\s]+\/\S+\/|(?:v|e(?:mbed)?|shorts)\/|\S*?[?&]v=)|youtu\.be\/)([a-zA-Z0-9_-]{11})"
33
- match = re.search(regex, url)
34
- return match.group(1) if match else None
35
-
36
- # Optional: Clean up repeated or spammy lines
37
- def clean_transcript(text):
38
- lines = text.split("\n")
39
- seen = set()
40
- clean_lines = []
41
- for line in lines:
42
- line = line.strip()
43
- if not line or line.lower() in seen:
44
- continue
45
- if re.match(r'https?:\/\/', line):
46
- continue
47
- seen.add(line.lower())
48
- clean_lines.append(line)
49
- return " ".join(clean_lines)
50
-
51
- # Main logic to fetch transcript and summarize
52
- def get_youtube_transcript(video_url):
53
- video_id = extract_video_id(video_url)
54
- if not video_id:
55
- return "Could not extract video ID. Please check the URL."
56
-
57
- try:
58
- yt = YouTubeTranscriptApi()
59
- transcript = yt.fetch(video_id, languages=['en'])
60
-
61
- formatter = TextFormatter()
62
- raw_text = formatter.format_transcript(transcript)
63
- cleaned_text = clean_transcript(raw_text)
64
- summary = summarize_with_t5(cleaned_text)
65
- return summary
66
-
67
- except Exception as e:
68
- return f"Error occurred: {e}"
69
-
70
- # Gradio UI
71
- demo = gr.Interface(
72
- fn=get_youtube_transcript,
73
- inputs=[gr.Textbox(label="YouTube Video URL", lines=1, placeholder="Paste your YouTube URL here")],
74
- outputs=[gr.Textbox(label="Summarized Transcript", lines=10)],
75
- title="YouTube Video Summarizer",
76
- description="This app extracts and summarizes the transcript of a YouTube video using a fine-tuned T5 model."
77
- )
78
-
79
- demo.launch()
 
1
+ import re
2
+ import torch
3
+ from transformers import T5ForConditionalGeneration, T5Tokenizer
4
+ from youtube_transcript_api import YouTubeTranscriptApi
5
+ from youtube_transcript_api.formatters import TextFormatter
6
+ import gradio as gr
7
+
8
+ # Load the T5 model and tokenizer
9
+ model_name = "bilal521/t5-youtube-summarizer"
10
+ model = T5ForConditionalGeneration.from_pretrained(model_name)
11
+ tokenizer = T5Tokenizer.from_pretrained(model_name)
12
+
13
+ # Clean and summarize text
14
def summarize_with_t5(text):
    """Summarize *text* with the module-level T5 ``model`` and ``tokenizer``.

    The input is prefixed with ``"summarize: "``, tokenized, and passed to
    beam-search generation; the first generated sequence is decoded.

    Args:
        text: Transcript text to summarize.

    Returns:
        The decoded summary string, or ``""`` when *text* is ``None``,
        empty, or whitespace-only.
    """
    # With no real content the model would still be forced to emit at least
    # ``min_length`` tokens, i.e. text unrelated to any input. Bail out early.
    if not text or not text.strip():
        return ""

    input_text = "summarize: " + text.strip()

    # NOTE: truncation=True with max_length=512 means only the first 512
    # tokens of the prompt are seen by the model; the rest is dropped.
    inputs = tokenizer.encode(
        input_text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )

    summary_ids = model.generate(
        inputs,
        max_length=256,
        min_length=80,
        num_beams=5,
        length_penalty=2.0,
        no_repeat_ngram_size=3,
        early_stopping=True,
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
29
+
30
+ # Extract video ID from any YouTube URL
31
def extract_video_id(url):
    """Return the 11-character YouTube video ID found in *url*.

    Recognised forms are ``youtu.be/<id>``, ``youtube.com/watch?v=<id>``
    (or any query string containing ``v=``), and the ``/v/``, ``/e/``,
    ``/embed/``, ``/shorts/`` and ``/live/`` paths.

    Args:
        url: The URL text to search.

    Returns:
        The video ID string, or ``None`` when *url* is not a string or no
        ID can be found in it.
    """
    # re.search raises TypeError on None / non-string input; treat that as
    # "no ID" so callers get the same None they get for an unparseable URL.
    if not isinstance(url, str):
        return None

    pattern = (
        r"(?:youtube\.com\/"
        r"(?:[^\/\n\s]+\/\S+\/|(?:v|e(?:mbed)?|shorts|live)\/|\S*?[?&]v=)"
        r"|youtu\.be\/)"
        r"([a-zA-Z0-9_-]{11})"
    )
    match = re.search(pattern, url)
    return match.group(1) if match else None
35
+
36
+ # Optional: Clean up repeated or spammy lines
37
def clean_transcript(text):
    """Collapse a multi-line transcript into one de-duplicated line.

    Each line is stripped; blank lines, lines already seen (compared
    case-insensitively) and lines that start with ``http://`` or
    ``https://`` are dropped. The surviving lines are joined with spaces
    in their original order.

    Args:
        text: Raw transcript text, one caption per line.

    Returns:
        The cleaned text, or ``""`` when *text* is ``None`` or empty.
    """
    if not text:
        return ""

    seen = set()
    kept = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        key = line.lower()
        if not line or key in seen:
            continue
        # Plain prefix test; a regex is unnecessary for two fixed prefixes.
        if line.startswith(("http://", "https://")):
            continue
        seen.add(key)
        kept.append(line)
    return " ".join(kept)
50
+
51
+ # Main logic to fetch transcript and summarize
52
def get_youtube_transcript(video_url):
    """Fetch the English transcript for a YouTube URL and summarize it.

    Args:
        video_url: URL text entered by the user.

    Returns:
        The summary string on success; otherwise a human-readable message
        describing why no summary could be produced. This function does not
        raise: failures from fetching, cleaning or summarizing are reported
        in the returned string.
    """
    # Guard before extract_video_id: its regex search raises on non-string
    # input, and that call sits outside the try block below.
    if not isinstance(video_url, str) or not video_url.strip():
        return "Could not extract video ID. Please check the URL."

    video_id = extract_video_id(video_url)
    if not video_id:
        return "Could not extract video ID. Please check the URL."

    try:
        yt = YouTubeTranscriptApi()
        transcript = yt.fetch(video_id, languages=['en'])

        formatter = TextFormatter()
        raw_text = formatter.format_transcript(transcript)
        cleaned_text = clean_transcript(raw_text)

        # Nothing left after cleaning: do not ask the model to summarize
        # an empty prompt.
        if not cleaned_text:
            return "Error occurred: transcript is empty."

        return summarize_with_t5(cleaned_text)

    except Exception as e:
        # UI boundary: surface any failure as text instead of crashing the app.
        return f"Error occurred: {e}"
69
+
70
+ # Gradio UI
71
# Input and output widgets, named so the Interface call below stays short.
_url_box = gr.Textbox(
    label="YouTube Video URL",
    lines=1,
    placeholder="Paste your YouTube URL here",
)
_summary_box = gr.Textbox(label="Summarized Transcript", lines=10)

# Wire the URL box to get_youtube_transcript and show its return value.
demo = gr.Interface(
    fn=get_youtube_transcript,
    inputs=[_url_box],
    outputs=[_summary_box],
    title="YouTube Video Summarizer",
    description="This app extracts and summarizes the transcript of a YouTube video using a fine-tuned T5 model.",
)

demo.launch()