maliahson committed on
Commit 6c007b8 · verified · 1 Parent(s): 353620b

Update youtube_utils.py

Files changed (1)
  1. youtube_utils.py +100 -91
youtube_utils.py CHANGED
@@ -1,91 +1,100 @@
- # youtube_utils.py
- import re
- import torch
- from transformers import BartForConditionalGeneration, BartTokenizer
- from youtube_transcript_api import YouTubeTranscriptApi
- from nltk.tokenize import sent_tokenize
- import nltk
-
- nltk.download('punkt')
-
- def clean_text(text):
-     cleaned_text = re.sub(r'\s+', ' ', text)
-     cleaned_text = cleaned_text.replace("'", "")
-     return cleaned_text
-
- def get_youtube_captions(video_id):
-     try:
-         transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
-         full_transcript = ""
-
-         for transcript in transcript_list:
-             try:
-                 english_transcript = transcript.translate('en').fetch()
-                 for caption in english_transcript:
-                     full_transcript += caption['text'] + " "
-                 break
-             except Exception:
-                 continue
-
-         return clean_text(full_transcript)
-
-     except Exception as e:
-         print(f"Error fetching captions: {e}")
-         return None
-
- def summarize_large_text_with_bart(input_text):
-     model_name = "facebook/bart-large-cnn"
-     model = BartForConditionalGeneration.from_pretrained(model_name)
-     tokenizer = BartTokenizer.from_pretrained(model_name)
-
-     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-     model.to(device)
-
-     input_tokens = tokenizer.encode(input_text, add_special_tokens=False)
-     total_input_length = len(input_tokens)
-
-     desired_min_length = int(total_input_length * 0.28)
-     desired_max_length = int(total_input_length * 0.40)
-
-     sentences = sent_tokenize(input_text)
-     max_chunk_length = 1024
-     overlap = 2
-     chunks = []
-
-     sentence_tokens = [tokenizer.encode(sentence, add_special_tokens=False) for sentence in sentences]
-     sentence_lengths = [len(tokens) for tokens in sentence_tokens]
-
-     i = 0
-     while i < len(sentences):
-         current_chunk = []
-         current_length = 0
-         start = i
-
-         while i < len(sentences) and current_length + sentence_lengths[i] <= max_chunk_length:
-             current_chunk.append(sentences[i])
-             current_length += sentence_lengths[i]
-             i += 1
-
-         if i < len(sentences):
-             i = i - overlap if i - overlap > start else start
-
-         chunks.append(' '.join(current_chunk))
-
-     summaries = []
-     for chunk in chunks:
-         inputs = tokenizer.encode(chunk, return_tensors='pt', max_length=1024, truncation=True).to(device)
-
-         with torch.no_grad():
-             summary_ids = model.generate(
-                 inputs,
-                 max_length=desired_max_length // len(chunks),
-                 min_length=desired_min_length // len(chunks),
-                 num_beams=4,
-                 length_penalty=2.0,
-                 no_repeat_ngram_size=3,
-                 early_stopping=True
-             )
-
-         summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
-
-     return ' '.join(summaries)
+ import re
+ import torch
+ from transformers import BartForConditionalGeneration, BartTokenizer
+ from youtube_transcript_api import YouTubeTranscriptApi
+ from nltk.tokenize import sent_tokenize
+ import nltk
+
+ # Ensure NLTK data is downloaded during the first run
+ nltk.download('punkt')
+
+ def clean_text(text):
+     """Clean up text by removing extra whitespace and quotes."""
+     cleaned_text = re.sub(r'\s+', ' ', text)
+     cleaned_text = cleaned_text.replace("'", "")
+     return cleaned_text
+
+ def get_youtube_captions(video_id):
+     """Fetch captions for a YouTube video, translating to English if needed."""
+     try:
+         transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
+         full_transcript = ""
+
+         for transcript in transcript_list:
+             try:
+                 english_transcript = transcript.translate('en').fetch()
+                 for caption in english_transcript:
+                     full_transcript += caption['text'] + " "
+                 break
+             except Exception:
+                 continue
+
+         return clean_text(full_transcript)
+
+     except Exception as e:
+         print(f"Error fetching captions: {e}")
+         return None
+
+ def summarize_large_text_with_bart(input_text):
+     """Summarize large text using BART model."""
+     model_name = "facebook/bart-large-cnn"
+
+     # Load tokenizer and model
+     tokenizer = BartTokenizer.from_pretrained(model_name)
+     model = BartForConditionalGeneration.from_pretrained(model_name)
+
+     # Use GPU if available
+     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+     model.to(device)
+
+     # Tokenize input and calculate summary lengths
+     input_tokens = tokenizer.encode(input_text, add_special_tokens=False)
+     total_input_length = len(input_tokens)
+
+     desired_min_length = int(total_input_length * 0.28)
+     desired_max_length = int(total_input_length * 0.40)
+
+     # Split input into chunks of <= 1024 tokens with overlap
+     sentences = sent_tokenize(input_text)
+     max_chunk_length = 1024
+     overlap = 2
+     chunks = []
+
+     sentence_tokens = [tokenizer.encode(sentence, add_special_tokens=False) for sentence in sentences]
+     sentence_lengths = [len(tokens) for tokens in sentence_tokens]
+
+     i = 0
+     while i < len(sentences):
+         current_chunk = []
+         current_length = 0
+         start = i
+
+         while i < len(sentences) and current_length + sentence_lengths[i] <= max_chunk_length:
+             current_chunk.append(sentences[i])
+             current_length += sentence_lengths[i]
+             i += 1
+
+         if i < len(sentences):
+             i = i - overlap if i - overlap > start else start
+
+         chunks.append(' '.join(current_chunk))
+
+     # Generate summaries for each chunk
+     summaries = []
+     for chunk in chunks:
+         inputs = tokenizer.encode(chunk, return_tensors='pt', max_length=1024, truncation=True).to(device)
+
+         with torch.no_grad():
+             summary_ids = model.generate(
+                 inputs,
+                 max_length=desired_max_length // len(chunks),
+                 min_length=desired_min_length // len(chunks),
+                 num_beams=4,
+                 length_penalty=2.0,
+                 no_repeat_ngram_size=3,
+                 early_stopping=True
+             )
+
+         summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
+
+     return ' '.join(summaries)