# Spaces: Sleeping
# youtube_utils.py
import re | |
import torch | |
from transformers import BartForConditionalGeneration, BartTokenizer | |
from youtube_transcript_api import YouTubeTranscriptApi | |
from nltk.tokenize import sent_tokenize | |
import nltk | |
# sent_tokenize needs NLTK's Punkt sentence-tokenizer data. Recent NLTK
# releases load it from the "punkt_tab" resource instead of "punkt", so fetch
# both to keep sent_tokenize working regardless of the installed version.
nltk.download('punkt')
nltk.download('punkt_tab')
def clean_text(text):
    """Collapse whitespace runs in ``text`` and remove apostrophes.

    Each run of whitespace characters is replaced by a single space, after
    which every single-quote character (') is dropped. Leading and trailing
    whitespace is reduced to one space rather than stripped.
    """
    return re.sub(r'\s+', ' ', text).replace("'", "")
def get_youtube_captions(video_id):
    """Fetch a video's captions as a single cleaned English string.

    Walks the transcripts listed for ``video_id`` and uses the first one that
    can be read completely. Transcripts whose language code is English are
    fetched as-is; every other transcript is translated to English first.
    The caption texts are concatenated (each followed by a space) and passed
    through ``clean_text``.

    Args:
        video_id: Video identifier passed to
            ``YouTubeTranscriptApi.list_transcripts``.

    Returns:
        The cleaned transcript text. This is the cleaned empty string when
        none of the listed transcripts could be fetched. ``None`` is returned
        when an error escapes the per-transcript handling (for example when
        listing the transcripts fails); the error is printed in that case.
    """
    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        caption_texts = []
        for transcript in transcript_list:
            try:
                language_code = (getattr(transcript, 'language_code', '') or '').lower()
                if language_code == 'en' or language_code.startswith('en-'):
                    # Already English: translating would be redundant and
                    # fails outright for transcripts that are not translatable.
                    english_transcript = transcript.fetch()
                else:
                    english_transcript = transcript.translate('en').fetch()
                # Only keep text from a transcript that was read completely.
                caption_texts = [caption['text'] for caption in english_transcript]
                break
            except Exception:
                # Best effort: this transcript cannot be used, try the next one.
                continue
        # Single join instead of repeated string +=; the space after every
        # caption (including the last) matches the previous output format.
        full_transcript = "".join(text + " " for text in caption_texts)
        return clean_text(full_transcript)
    except Exception as e:
        print(f"Error fetching captions: {e}")
        return None
def _plan_chunk_spans(unit_lengths, max_chunk_length, overlap):
    """Group consecutive units into ``(start, end)`` index spans within a token budget.

    Each span covers ``unit_lengths[start:end]`` and its lengths sum to at
    most ``max_chunk_length`` (a single oversized unit still gets a span of
    its own). Consecutive spans share up to ``overlap`` trailing units, but
    every span starts after the previous one's start and reaches past the
    previous one's end, so the planning always terminates.
    """
    spans = []
    total_units = len(unit_lengths)
    start = 0
    while start < total_units:
        end = start
        used = 0
        while end < total_units and used + unit_lengths[end] <= max_chunk_length:
            used += unit_lengths[end]
            end += 1
        if end == start:
            # The unit alone exceeds the budget: emit it by itself so the
            # loop advances instead of producing empty spans forever.
            end = start + 1
        spans.append((start, end))
        if end >= total_units:
            break
        # Step back for context, but never to (or before) the current start.
        next_start = max(end - overlap, start + 1)
        # Shrink the overlap until the next span can take at least one new
        # unit; otherwise it would merely repeat the tail of this span.
        while next_start < end and sum(unit_lengths[next_start:end + 1]) > max_chunk_length:
            next_start += 1
        start = next_start
    return spans


def summarize_large_text_with_bart(input_text):
    """Summarise arbitrarily long text with ``facebook/bart-large-cnn``.

    The text is split into sentences, sentences are packed into overlapping
    chunks that fit the 1024-token chunk limit, each chunk is summarised
    separately, and the partial summaries are joined with spaces. Sentences
    that exceed the limit on their own (e.g. unpunctuated captions) are cut
    into token windows so none of their content is silently dropped.

    The overall summary length is targeted at 28%-40% of the input's token
    count, divided evenly across the chunks.

    NOTE: the model and tokenizer are loaded on every call.

    Args:
        input_text: Text to summarise. Empty, whitespace-only or ``None``
            input yields an empty string without loading the model.

    Returns:
        The concatenated chunk summaries as one string.
    """
    # Nothing to summarise: skip the expensive model load entirely.
    if not input_text or not input_text.strip():
        return ""

    model_name = "facebook/bart-large-cnn"
    model = BartForConditionalGeneration.from_pretrained(model_name)
    tokenizer = BartTokenizer.from_pretrained(model_name)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    total_input_length = len(tokenizer.encode(input_text, add_special_tokens=False))
    desired_min_length = int(total_input_length * 0.28)
    desired_max_length = int(total_input_length * 0.40)

    max_chunk_length = 1024
    overlap = 2
    # Leave room for the special tokens the tokenizer adds when the chunk is
    # encoded for the model below.
    token_budget = max_chunk_length - tokenizer.num_special_tokens_to_add()

    # A "unit" is a sentence, or a token window of a sentence that is too
    # long to fit into a chunk by itself.
    units = []
    unit_lengths = []
    for sentence in sent_tokenize(input_text):
        token_ids = tokenizer.encode(sentence, add_special_tokens=False)
        if len(token_ids) <= token_budget:
            units.append(sentence)
            unit_lengths.append(len(token_ids))
            continue
        for offset in range(0, len(token_ids), token_budget):
            window = token_ids[offset:offset + token_budget]
            units.append(tokenizer.decode(window, skip_special_tokens=True).strip())
            unit_lengths.append(len(window))

    if not units:
        return ""

    chunks = [
        ' '.join(units[start:end])
        for start, end in _plan_chunk_spans(unit_lengths, token_budget, overlap)
    ]

    # Split the length targets evenly across chunks. The floor keeps
    # max_length from dropping to 0 or 1 for very short inputs, which would
    # leave no room for any generated token. min stays <= max because the
    # 0.28 ratio is below the 0.40 ratio.
    per_chunk_max = max(desired_max_length // len(chunks), 2)
    per_chunk_min = desired_min_length // len(chunks)

    summaries = []
    for chunk in chunks:
        # Token counts of joined sentences are approximate, so truncation
        # remains as a safety net.
        inputs = tokenizer.encode(
            chunk, return_tensors='pt', max_length=1024, truncation=True
        ).to(device)
        with torch.no_grad():
            summary_ids = model.generate(
                inputs,
                max_length=per_chunk_max,
                min_length=per_chunk_min,
                num_beams=4,
                length_penalty=2.0,
                no_repeat_ngram_size=3,
                early_stopping=True,
            )
        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
    return ' '.join(summaries)