# backend.py
import spacy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable
from googleapiclient.discovery import build
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re

# Initialize spaCy and VADER
nlp = spacy.load("en_core_web_sm")
sia = SentimentIntensityAnalyzer()

# YouTube Data API key
YOUTUBE_API_KEY = "YOUR_YOUTUBE_API_KEY"

# Fetch metadata for a YouTube video via the YouTube Data API
def fetch_video_metadata(video_url):
    video_id = video_url.split('v=')[-1]
    youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)
    try:
        request = youtube.videos().list(part="snippet,statistics", id=video_id)
        response = request.execute()
        if not response.get('items'):
            # The Data API returns an empty items list for missing or private videos
            return None, "Video is unavailable."
        video_data = response['items'][0]
        metadata = {
            "channel_name": video_data['snippet']['channelTitle'],
            "video_title": video_data['snippet']['title'],
            "views": video_data['statistics']['viewCount'],
            "likes": video_data['statistics'].get('likeCount', 'N/A'),
            "dislikes": video_data['statistics'].get('dislikeCount', 'N/A'),
            "posted_date": video_data['snippet']['publishedAt']
        }
        return metadata, None
    except Exception as e:
        return None, str(e)
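
# fetch_video_metadata and fetch_transcript share a (value, error) return convention:
# (data, None) on success, (None, "message") on failure. Illustrative usage only, with
# a placeholder URL:
#   metadata, error = fetch_video_metadata("https://www.youtube.com/watch?v=VIDEO_ID")
#   if error is None:
#       print(metadata["video_title"], metadata["views"])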

# Fetch the transcript for a YouTube video
def fetch_transcript(video_url):
    video_id = video_url.split('v=')[-1]
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        text = " ".join([t['text'] for t in transcript])
        return text, None
    except (TranscriptsDisabled, VideoUnavailable):
        return None, "Transcript not available for this video."
    except Exception as e:
        return None, str(e)
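
# Note: YouTubeTranscriptApi.get_transcript() is the classic (pre-1.0) interface of
# youtube-transcript-api; newer releases moved to an instance-based API, so this call
# may need adjusting if the dependency is upgraded. Auto-generated transcripts also
# tend to arrive without sentence punctuation, which is presumably why the splitter
# below falls back on conjunctions as break points.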

# Split long sentences into shorter chunks for better processing
def split_long_sentences(text):
    doc = nlp(text)  # Tokenize into sentences using spaCy
    sentences = []
    for sent in doc.sents:
        if len(sent.text.split()) > 25:
            sub_sentences = []
            current_chunk = []
            for token in sent:
                current_chunk.append(token.text)
                # Close a chunk at sentence-final punctuation
                if token.is_punct and token.text in {".", "!", "?"}:
                    sub_sentences.append(" ".join(current_chunk).strip())
                    current_chunk = []
                # Or at a conjunction, once the chunk holds more than three tokens
                elif token.text.lower() in {"and", "but", "because", "so"}:
                    if len(current_chunk) > 3:
                        sub_sentences.append(" ".join(current_chunk).strip())
                        current_chunk = []
            # Flush whatever remains after the last break point
            if current_chunk:
                sub_sentences.append(" ".join(current_chunk).strip())
            sentences.extend(sub_sentences)
        else:
            sentences.append(sent.text.strip())
    return sentences

# Read the keywords from the provided Excel file
def read_keywords(file_path):
    df = pd.read_excel(file_path)
    attributes = df.columns.tolist()
    keywords = {}
    for attribute in attributes:
        keywords[attribute] = df[attribute].dropna().tolist()
    return keywords, attributes
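
# Assumed spreadsheet layout (inferred from how read_keywords consumes it): one column
# per attribute, with that attribute's keywords listed in the rows beneath it. The
# attribute and keyword names below are purely illustrative:
#
#   | battery      | camera    | price  |
#   |--------------|-----------|--------|
#   | battery life | photo     | cheap  |
#   | charging     | low light | budget |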

# Match keywords against sentences
def match_keywords_in_sentences(sentences, keywords):
    matched_keywords = {attribute: [] for attribute in keywords}
    for sentence in sentences:
        for attribute, sub_keywords in keywords.items():
            for keyword in sub_keywords:
                if keyword.lower() in sentence.lower():
                    matched_keywords[attribute].append(sentence)
                    break  # Avoid adding the same sentence more than once per attribute
    return matched_keywords
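
# Minimal sketch of how these helpers might be chained together; the actual wiring
# presumably lives elsewhere in this Space, and the URL and file name here are placeholders:
#   transcript, err = fetch_transcript("https://www.youtube.com/watch?v=VIDEO_ID")
#   if err is None:
#       sentences = split_long_sentences(transcript)
#       keywords, attributes = read_keywords("keywords.xlsx")
#       matches = match_keywords_in_sentences(sentences, keywords)
#       # matches maps each attribute to the transcript sentences that mention it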