Spaces:
Sleeping
Sleeping
Upload youtube_utils.py
Browse files- youtube_utils.py +91 -0
youtube_utils.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# youtube_utils.py
|
2 |
+
import re
|
3 |
+
import torch
|
4 |
+
from transformers import BartForConditionalGeneration, BartTokenizer
|
5 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
6 |
+
from nltk.tokenize import sent_tokenize
|
7 |
+
import nltk
|
8 |
+
|
9 |
+
nltk.download('punkt')
|
10 |
+
|
11 |
+
def clean_text(text):
|
12 |
+
cleaned_text = re.sub(r'\s+', ' ', text)
|
13 |
+
cleaned_text = cleaned_text.replace("'", "")
|
14 |
+
return cleaned_text
|
15 |
+
|
16 |
+
def get_youtube_captions(video_id):
|
17 |
+
try:
|
18 |
+
transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
|
19 |
+
full_transcript = ""
|
20 |
+
|
21 |
+
for transcript in transcript_list:
|
22 |
+
try:
|
23 |
+
english_transcript = transcript.translate('en').fetch()
|
24 |
+
for caption in english_transcript:
|
25 |
+
full_transcript += caption['text'] + " "
|
26 |
+
break
|
27 |
+
except Exception:
|
28 |
+
continue
|
29 |
+
|
30 |
+
return clean_text(full_transcript)
|
31 |
+
|
32 |
+
except Exception as e:
|
33 |
+
print(f"Error fetching captions: {e}")
|
34 |
+
return None
|
35 |
+
|
36 |
+
def summarize_large_text_with_bart(input_text):
|
37 |
+
model_name = "facebook/bart-large-cnn"
|
38 |
+
model = BartForConditionalGeneration.from_pretrained(model_name)
|
39 |
+
tokenizer = BartTokenizer.from_pretrained(model_name)
|
40 |
+
|
41 |
+
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
42 |
+
model.to(device)
|
43 |
+
|
44 |
+
input_tokens = tokenizer.encode(input_text, add_special_tokens=False)
|
45 |
+
total_input_length = len(input_tokens)
|
46 |
+
|
47 |
+
desired_min_length = int(total_input_length * 0.28)
|
48 |
+
desired_max_length = int(total_input_length * 0.40)
|
49 |
+
|
50 |
+
sentences = sent_tokenize(input_text)
|
51 |
+
max_chunk_length = 1024
|
52 |
+
overlap = 2
|
53 |
+
chunks = []
|
54 |
+
|
55 |
+
sentence_tokens = [tokenizer.encode(sentence, add_special_tokens=False) for sentence in sentences]
|
56 |
+
sentence_lengths = [len(tokens) for tokens in sentence_tokens]
|
57 |
+
|
58 |
+
i = 0
|
59 |
+
while i < len(sentences):
|
60 |
+
current_chunk = []
|
61 |
+
current_length = 0
|
62 |
+
start = i
|
63 |
+
|
64 |
+
while i < len(sentences) and current_length + sentence_lengths[i] <= max_chunk_length:
|
65 |
+
current_chunk.append(sentences[i])
|
66 |
+
current_length += sentence_lengths[i]
|
67 |
+
i += 1
|
68 |
+
|
69 |
+
if i < len(sentences):
|
70 |
+
i = i - overlap if i - overlap > start else start
|
71 |
+
|
72 |
+
chunks.append(' '.join(current_chunk))
|
73 |
+
|
74 |
+
summaries = []
|
75 |
+
for chunk in chunks:
|
76 |
+
inputs = tokenizer.encode(chunk, return_tensors='pt', max_length=1024, truncation=True).to(device)
|
77 |
+
|
78 |
+
with torch.no_grad():
|
79 |
+
summary_ids = model.generate(
|
80 |
+
inputs,
|
81 |
+
max_length=desired_max_length // len(chunks),
|
82 |
+
min_length=desired_min_length // len(chunks),
|
83 |
+
num_beams=4,
|
84 |
+
length_penalty=2.0,
|
85 |
+
no_repeat_ngram_size=3,
|
86 |
+
early_stopping=True
|
87 |
+
)
|
88 |
+
|
89 |
+
summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
|
90 |
+
|
91 |
+
return ' '.join(summaries)
|