maliahson committed on
Commit
5bd0b3b
·
verified ·
1 Parent(s): ba5072f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -115
app.py CHANGED
@@ -1,16 +1,9 @@
1
- import re
2
- import torch
3
  import gradio as gr
4
- from transformers import BartForConditionalGeneration, BartTokenizer
5
- from youtube_transcript_api import YouTubeTranscriptApi
6
- from nltk.tokenize import sent_tokenize
7
- import nltk
8
-
9
- # Download NLTK data for tokenization
10
- nltk.download('punkt')
11
 
12
- # Function to extract YouTube video ID
13
  def extract_video_id(url):
 
14
  patterns = [
15
  r'(?:v=|\/)([0-9A-Za-z_-]{11}).*', # Standard and shortened URLs
16
  r'(?:embed\/)([0-9A-Za-z_-]{11})', # Embed URLs
@@ -23,123 +16,56 @@ def extract_video_id(url):
23
  return match.group(1)
24
  return None
25
 
26
- # Function to clean up text
27
- def clean_text(text):
28
- cleaned_text = re.sub(r'\s+', ' ', text)
29
- cleaned_text = cleaned_text.replace("'", "")
30
- return cleaned_text
31
-
32
- # Fetch captions from YouTube video
33
- def get_youtube_captions(video_id):
34
  try:
35
- transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
36
- full_transcript = ""
37
-
38
- for transcript in transcript_list:
39
- try:
40
- english_transcript = transcript.translate('en').fetch()
41
- for caption in english_transcript:
42
- full_transcript += caption['text'] + " "
43
- break
44
- except Exception:
45
- continue
46
-
47
- return clean_text(full_transcript)
48
-
49
- except Exception as e:
50
- print(f"Error fetching captions: {e}")
51
- return None
52
-
53
- # Summarize large text with BART model
54
- def summarize_large_text_with_bart(input_text):
55
- model_name = "facebook/bart-large-cnn"
56
- model = BartForConditionalGeneration.from_pretrained(model_name)
57
- tokenizer = BartTokenizer.from_pretrained(model_name)
58
-
59
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
60
- model.to(device)
61
-
62
- input_tokens = tokenizer.encode(input_text, add_special_tokens=False)
63
- total_input_length = len(input_tokens)
64
-
65
- desired_min_length = int(total_input_length * 0.28)
66
- desired_max_length = int(total_input_length * 0.40)
67
-
68
- sentences = sent_tokenize(input_text)
69
- max_chunk_length = 1024
70
- overlap = 2
71
- chunks = []
72
-
73
- sentence_tokens = [tokenizer.encode(sentence, add_special_tokens=False) for sentence in sentences]
74
- sentence_lengths = [len(tokens) for tokens in sentence_tokens]
75
-
76
- i = 0
77
- while i < len(sentences):
78
- current_chunk = []
79
- current_length = 0
80
- start = i
81
-
82
- while i < len(sentences) and current_length + sentence_lengths[i] <= max_chunk_length:
83
- current_chunk.append(sentences[i])
84
- current_length += sentence_lengths[i]
85
- i += 1
86
-
87
- if i < len(sentences):
88
- i = i - overlap if i - overlap > start else start
89
-
90
- chunks.append(' '.join(current_chunk))
91
-
92
- summaries = []
93
- for chunk in chunks:
94
- inputs = tokenizer.encode(chunk, return_tensors='pt', max_length=1024, truncation=True).to(device)
95
-
96
- with torch.no_grad():
97
- summary_ids = model.generate(
98
- inputs,
99
- max_length=desired_max_length // len(chunks),
100
- min_length=desired_min_length // len(chunks),
101
- num_beams=4,
102
- length_penalty=2.0,
103
- no_repeat_ngram_size=3,
104
- early_stopping=True
105
- )
106
-
107
- summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
108
-
109
- return ' '.join(summaries)
110
-
111
- # Gradio Interface
112
- def summarize_video_gradio(video_url):
113
- try:
114
- # Extract video ID from URL
115
  video_id = extract_video_id(video_url)
116
-
117
  if not video_id:
118
- return "Invalid YouTube URL. Please check the URL and try again."
119
 
120
- # Get captions for the video
121
  captions = get_youtube_captions(video_id)
122
  if not captions:
123
- return "Unable to fetch video captions. Ensure the video has subtitles available."
124
 
125
  # Generate summary
126
  summary = summarize_large_text_with_bart(captions)
127
- return f"**Video ID:** {video_id}\n\n**Summary:**\n{summary}"
 
 
 
 
128
 
129
  except Exception as e:
130
- return f"An error occurred: {str(e)}"
131
 
132
- # Gradio Interface Setup
133
- with gr.Blocks() as interface:
134
- gr.Markdown("## YouTube Video Summarizer")
135
- gr.Markdown("Paste a YouTube video URL below to fetch captions and summarize the content.")
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
- video_url_input = gr.Textbox(label="YouTube Video URL", placeholder="Enter YouTube URL here...")
138
- summarize_button = gr.Button("Summarize")
139
- output = gr.Textbox(label="Output", lines=10)
 
140
 
141
- summarize_button.click(summarize_video_gradio, inputs=video_url_input, outputs=output)
 
 
 
 
142
 
143
- # Launch the Gradio app
144
- if __name__ == "__main__":
145
- interface.launch()
 
 
 
1
  import gradio as gr
2
+ import re
3
+ from youtube_utils import get_youtube_captions, summarize_large_text_with_bart
 
 
 
 
 
4
 
 
5
  def extract_video_id(url):
6
+ # Handle different YouTube URL formats
7
  patterns = [
8
  r'(?:v=|\/)([0-9A-Za-z_-]{11}).*', # Standard and shortened URLs
9
  r'(?:embed\/)([0-9A-Za-z_-]{11})', # Embed URLs
 
16
  return match.group(1)
17
  return None
18
 
19
+ def summarize_video(video_url):
 
 
 
 
 
 
 
20
  try:
21
+ # Extract video ID
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  video_id = extract_video_id(video_url)
 
23
  if not video_id:
24
+ return "Error: Invalid YouTube URL.", None
25
 
26
+ # Get captions
27
  captions = get_youtube_captions(video_id)
28
  if not captions:
29
+ return "Error: Unable to fetch video captions.", None
30
 
31
  # Generate summary
32
  summary = summarize_large_text_with_bart(captions)
33
+
34
+ # Create embed URL for video preview
35
+ video_embed_url = f"https://www.youtube.com/embed/{video_id}"
36
+
37
+ return summary, video_embed_url
38
 
39
  except Exception as e:
40
+ return f"Error: {str(e)}", None
41
 
42
+ # Gradio interface
43
+ with gr.Blocks() as demo:
44
+ with gr.Row():
45
+ gr.Markdown("# YouTube Video Summarizer")
46
+
47
+ with gr.Row():
48
+ video_url_input = gr.Textbox(label="YouTube Video URL", placeholder="Enter YouTube video URL")
49
+
50
+ with gr.Row():
51
+ summarize_button = gr.Button("Summarize Video")
52
+
53
+ with gr.Row():
54
+ summary_output = gr.Textbox(label="Summary", interactive=False, lines=10)
55
+
56
+ with gr.Row():
57
+ video_preview = gr.HTML(label="Video Preview")
58
 
59
+ def handle_button_click(video_url):
60
+ summary, video_embed_url = summarize_video(video_url)
61
+ video_html = f'<iframe width="560" height="315" src="{video_embed_url}" frameborder="0" allowfullscreen></iframe>' if video_embed_url else ""
62
+ return summary, video_html
63
 
64
+ summarize_button.click(
65
+ fn=handle_button_click,
66
+ inputs=[video_url_input],
67
+ outputs=[summary_output, video_preview]
68
+ )
69
 
70
+ # Launch the app
71
+ demo.launch()