# backend.py
import spacy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable
from googleapiclient.discovery import build
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re

# Initialize spaCy and VADER
nlp = spacy.load("en_core_web_sm")
sia = SentimentIntensityAnalyzer()

# YouTube Data API key (replace with your own key)
YOUTUBE_API_KEY = "YOUR_YOUTUBE_API_KEY"

# Fetch metadata for a YouTube video via the YouTube Data API
def fetch_video_metadata(video_url):
    # Keep only the video id, dropping any trailing query parameters (e.g. "&t=30s")
    video_id = video_url.split('v=')[-1].split('&')[0]
    youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)
    try:
        request = youtube.videos().list(part="snippet,statistics", id=video_id)
        response = request.execute()
        if not response['items']:
            return None, "Video is unavailable."
        video_data = response['items'][0]
        metadata = {
            "channel_name": video_data['snippet']['channelTitle'],
            "video_title": video_data['snippet']['title'],
            "views": video_data['statistics']['viewCount'],
            "likes": video_data['statistics'].get('likeCount', 'N/A'),
            "dislikes": video_data['statistics'].get('dislikeCount', 'N/A'),
            "posted_date": video_data['snippet']['publishedAt']
        }
        return metadata, None
    except Exception as e:
        return None, str(e)

# Fetch the transcript for a YouTube video
def fetch_transcript(video_url):
    video_id = video_url.split('v=')[-1].split('&')[0]
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        text = " ".join([t['text'] for t in transcript])
        return text, None
    except (TranscriptsDisabled, VideoUnavailable):
        return None, "Transcript not available for this video."
    except Exception as e:
        return None, str(e)

# Split long sentences into smaller chunks for better processing
def split_long_sentences(text):
    doc = nlp(text)  # Tokenize into sentences using spaCy
    sentences = []
    for sent in doc.sents:
        # Only break up sentences longer than 25 words
        if len(sent.text.split()) > 25:
            sub_sentences = []
            current_chunk = []
            for token in sent:
                current_chunk.append(token.text)
                # End a chunk at sentence-final punctuation...
                if token.is_punct and token.text in {".", "!", "?"}:
                    sub_sentences.append(" ".join(current_chunk).strip())
                    current_chunk = []
                # ...or at a conjunction, provided the chunk is long enough
                elif token.text.lower() in {"and", "but", "because", "so"}:
                    if len(current_chunk) > 3:
                        sub_sentences.append(" ".join(current_chunk).strip())
                        current_chunk = []
            if current_chunk:
                sub_sentences.append(" ".join(current_chunk).strip())
            sentences.extend(sub_sentences)
        else:
            sentences.append(sent.text.strip())
    return sentences

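# Illustrative sketch (an assumption, not wired into the pipeline above): the
# SentimentIntensityAnalyzer initialized as `sia` can score each split sentence.
# The function name and the use of the compound score are examples only.
def score_sentences_example(sentences):
    scores = []
    for sentence in sentences:
        # polarity_scores returns neg/neu/pos plus a compound score in [-1, 1]
        compound = sia.polarity_scores(sentence)["compound"]
        scores.append({"sentence": sentence, "compound": compound})
    return scores
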
# Read the keywords from the provided Excel file
def read_keywords(file_path):
    df = pd.read_excel(file_path)
    attributes = df.columns.tolist()
    keywords = {}
    for attribute in attributes:
        keywords[attribute] = df[attribute].dropna().tolist()
    return keywords, attributes

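# Assumed layout of the keywords spreadsheet (illustrative example only; the
# real attribute names and keywords come from whatever file is supplied):
#
#   | Battery      | Camera        | Performance |
#   |--------------|---------------|-------------|
#   | battery life | photo quality | lag         |
#   | charging     | low light     | smooth      |
#
# read_keywords() turns each column into {"Battery": ["battery life", ...], ...}
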
# Match keywords with sentences
def match_keywords_in_sentences(sentences, keywords):
    matched_keywords = {attribute: [] for attribute in keywords}
    for sentence in sentences:
        for attribute, sub_keywords in keywords.items():
            for keyword in sub_keywords:
                if keyword.lower() in sentence.lower():
                    matched_keywords[attribute].append(sentence)
                    break  # Avoid adding the same sentence twice for one attribute
    return matched_keywords
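
# Sketch of how the pieces above could be chained together. The URL and the
# keywords file path are placeholders; in practice a frontend (e.g. a Streamlit
# or Gradio app) would supply real values and render the results.
if __name__ == "__main__":
    sample_url = "https://www.youtube.com/watch?v=VIDEO_ID"  # placeholder
    keywords_file = "keywords.xlsx"                           # placeholder

    metadata, meta_error = fetch_video_metadata(sample_url)
    transcript, transcript_error = fetch_transcript(sample_url)

    if meta_error or transcript_error:
        print(meta_error or transcript_error)
    else:
        print(metadata["video_title"], "-", metadata["channel_name"])
        sentences = split_long_sentences(transcript)
        keywords, attributes = read_keywords(keywords_file)
        matches = match_keywords_in_sentences(sentences, keywords)
        for attribute in attributes:
            print(attribute, "->", len(matches[attribute]), "matching sentences")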