# backend.py
import spacy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable
from googleapiclient.discovery import build
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re

# Initialize spaCy and VADER
nlp = spacy.load("en_core_web_sm")
sia = SentimentIntensityAnalyzer()

# YouTube Data API key
YOUTUBE_API_KEY = "YOUR_YOUTUBE_API_KEY"


# Fetch metadata of a YouTube video via the YouTube Data API
def fetch_video_metadata(video_url):
    # Extract the video ID, dropping any trailing query parameters after "v="
    video_id = video_url.split('v=')[-1].split('&')[0]
    youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)
    try:
        request = youtube.videos().list(part="snippet,statistics", id=video_id)
        response = request.execute()
        # An empty "items" list means the video does not exist or is private
        if not response.get('items'):
            return None, "Video is unavailable."
        video_data = response['items'][0]
        metadata = {
            "channel_name": video_data['snippet']['channelTitle'],
            "video_title": video_data['snippet']['title'],
            "views": video_data['statistics']['viewCount'],
            "likes": video_data['statistics'].get('likeCount', 'N/A'),
            # Public dislike counts are no longer returned by the Data API
            "dislikes": video_data['statistics'].get('dislikeCount', 'N/A'),
            "posted_date": video_data['snippet']['publishedAt'],
        }
        return metadata, None
    except Exception as e:
        return None, str(e)


# Fetch the transcript for a YouTube video
def fetch_transcript(video_url):
    video_id = video_url.split('v=')[-1].split('&')[0]
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        text = " ".join([t['text'] for t in transcript])
        return text, None
    except (TranscriptsDisabled, VideoUnavailable):
        return None, "Transcript not available for this video."
    except Exception as e:
        return None, str(e)


# Split long sentences into smaller chunks for better processing
def split_long_sentences(text):
    doc = nlp(text)  # Tokenize into sentences using spaCy
    sentences = []
    for sent in doc.sents:
        if len(sent.text.split()) > 25:
            sub_sentences = []
            current_chunk = []
            for token in sent:
                current_chunk.append(token.text)
                # Close a chunk at sentence-ending punctuation
                if token.is_punct and token.text in {".", "!", "?"}:
                    sub_sentences.append(" ".join(current_chunk).strip())
                    current_chunk = []
                # Also break at coordinating conjunctions once the chunk has some content
                elif token.text.lower() in {"and", "but", "because", "so"}:
                    if len(current_chunk) > 3:
                        sub_sentences.append(" ".join(current_chunk).strip())
                        current_chunk = []
            if current_chunk:
                sub_sentences.append(" ".join(current_chunk).strip())
            sentences.extend(sub_sentences)
        else:
            sentences.append(sent.text.strip())
    return sentences


# Read the keywords from the provided Excel file (one column per attribute)
def read_keywords(file_path):
    df = pd.read_excel(file_path)
    attributes = df.columns.tolist()
    keywords = {}
    for attribute in attributes:
        keywords[attribute] = df[attribute].dropna().tolist()
    return keywords, attributes


# Match keywords against sentences, grouping matched sentences by attribute
def match_keywords_in_sentences(sentences, keywords):
    matched_keywords = {attribute: [] for attribute in keywords}
    for sentence in sentences:
        for attribute, sub_keywords in keywords.items():
            for keyword in sub_keywords:
                if keyword.lower() in sentence.lower():
                    matched_keywords[attribute].append(sentence)
                    # Avoid appending the same sentence twice for one attribute
                    break
    return matched_keywords
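

# --- Example usage (illustrative sketch only, not part of the module API) ---
# The block below shows one way the helpers above might be chained together:
# fetch metadata and transcript, split the transcript into sentences, then
# match them against keywords. The video URL and "keywords.xlsx" path are
# placeholders; replace them with real values and set YOUTUBE_API_KEY first.
if __name__ == "__main__":
    sample_url = "https://www.youtube.com/watch?v=VIDEO_ID"  # placeholder URL

    metadata, meta_err = fetch_video_metadata(sample_url)
    if meta_err:
        print(f"Metadata error: {meta_err}")
    else:
        print(f"Analyzing '{metadata['video_title']}' by {metadata['channel_name']}")

    transcript, transcript_err = fetch_transcript(sample_url)
    if transcript_err:
        print(f"Transcript error: {transcript_err}")
    else:
        sentences = split_long_sentences(transcript)
        keywords, attributes = read_keywords("keywords.xlsx")  # assumed keyword file
        matches = match_keywords_in_sentences(sentences, keywords)
        for attribute, matched_sentences in matches.items():
            print(f"{attribute}: {len(matched_sentences)} matching sentence(s)")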