import re

import pandas as pd
import spacy
from fpdf import FPDF
from googleapiclient.discovery import build
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud
from youtube_transcript_api import (
    YouTubeTranscriptApi,
    TranscriptsDisabled,
    VideoUnavailable,
)

# Initialize spaCy and VADER
nlp = spacy.load("en_core_web_sm")
sia = SentimentIntensityAnalyzer()

# YouTube Data API key
YOUTUBE_API_KEY = "YOUR_API_KEY"


def extract_video_id(video_url):
    """Extract the video ID from a YouTube URL, tolerating extra query
    parameters (e.g. &t=30s) and youtu.be short links."""
    match = re.search(r"(?:v=|youtu\.be/)([\w-]{11})", video_url)
    return match.group(1) if match else video_url.split("v=")[-1]


def fetch_video_metadata(video_url):
    """Fetch title, channel, counts, and publish date via the YouTube Data API."""
    video_id = extract_video_id(video_url)
    youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)
    try:
        request = youtube.videos().list(part="snippet,statistics", id=video_id)
        response = request.execute()
        items = response.get("items", [])
        if not items:
            # The Data API returns an empty items list for unknown or removed
            # videos; it does not raise youtube_transcript_api's VideoUnavailable.
            return None, "Video is unavailable."
        video_data = items[0]
        metadata = {
            "channel_name": video_data["snippet"]["channelTitle"],
            "video_title": video_data["snippet"]["title"],
            "views": video_data["statistics"].get("viewCount", "N/A"),
            "likes": video_data["statistics"].get("likeCount", "N/A"),
            # dislikeCount is no longer publicly exposed, so this is usually N/A.
            "dislikes": video_data["statistics"].get("dislikeCount", "N/A"),
            "posted_date": video_data["snippet"]["publishedAt"],
        }
        return metadata, None
    except Exception as e:
        return None, str(e)


def fetch_transcript(video_url):
    """Fetch the video's transcript and join its segments into one string."""
    video_id = extract_video_id(video_url)
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        text = " ".join(t["text"] for t in transcript)
        return text, None
    except (TranscriptsDisabled, VideoUnavailable):
        return None, "Transcript not available for this video."
    except Exception as e:
        return None, str(e)


def split_long_sentences(text):
    """Split the transcript into sentences with spaCy, breaking any sentence
    longer than 25 words at sentence-final punctuation or at coordinating
    conjunctions (and/but/because/so)."""
    doc = nlp(text)
    sentences = []
    for sent in doc.sents:
        if len(sent.text.split()) > 25:
            sub_sentences = []
            current_chunk = []
            for token in sent:
                current_chunk.append(token.text)
                if token.is_punct and token.text in {".", "!", "?"}:
                    sub_sentences.append(" ".join(current_chunk).strip())
                    current_chunk = []
                elif token.text.lower() in {"and", "but", "because", "so"}:
                    # Only split on a conjunction if the chunk is long enough
                    # to stand alone.
                    if len(current_chunk) > 3:
                        sub_sentences.append(" ".join(current_chunk).strip())
                        current_chunk = []
            if current_chunk:
                sub_sentences.append(" ".join(current_chunk).strip())
            sentences.extend(sub_sentences)
        else:
            sentences.append(sent.text.strip())
    return sentences


def read_keywords(file_path):
    """Read an Excel sheet whose columns are attributes and whose cells are
    keywords. file_path is a Gradio file object, so the path is file_path.name."""
    df = pd.read_excel(file_path.name)
    attributes = df.columns.tolist()
    keywords = {attribute: df[attribute].dropna().tolist() for attribute in attributes}
    return keywords, attributes


def match_keywords_in_sentences(sentences, keywords):
    """Collect, per attribute, every sentence containing one of its keywords."""
    matched_keywords = {attribute: [] for attribute in keywords}
    for sentence in sentences:
        for attribute, sub_keywords in keywords.items():
            for keyword in sub_keywords:
                # str() guards against numeric cells read from Excel.
                if str(keyword).lower() in sentence.lower():
                    matched_keywords[attribute].append(sentence)
                    break  # record each sentence once per attribute
    return matched_keywords


def analyze_sentiment_for_keywords(matched_keywords, sentences):
    """Score each matched sentence with VADER's compound polarity."""
    sentiment_results = {attribute: [] for attribute in matched_keywords}
    for attribute, matched_sentences in matched_keywords.items():
        for sentence in matched_sentences:
            sentiment_score = sia.polarity_scores(sentence)["compound"]
            sentiment_results[attribute].append(
                {"sentence": sentence, "score": sentiment_score}
            )
    return sentiment_results


def generate_word_clouds(matched_keywords):
    """Build one word cloud per attribute, skipping attributes with no
    matches since WordCloud.generate() raises on empty input."""
    wordclouds = {}
    for attribute, sentences in matched_keywords.items():
        if sentences:
            wordclouds[attribute] = WordCloud().generate(" ".join(sentences))
    return wordclouds


def generate_pdf_with_sections(metadata, sentiment_results, wordclouds):
    """Assemble metadata, per-attribute sentiment scores, and word clouds
    into a PDF report. Note: classic FPDF encodes text as latin-1, so
    transcripts with characters outside that range may need a Unicode font."""
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    # Add metadata to PDF
pdf.cell(200, 10, txt=f"Video Title: {metadata['video_title']}", ln=True) pdf.cell(200, 10, txt=f"Channel: {metadata['channel_name']}", ln=True) pdf.cell(200, 10, txt=f"Posted Date: {metadata['posted_date']}", ln=True) pdf.cell(200, 10, txt=f"Views: {metadata['views']}", ln=True) # Add Sentiment Analysis Results for attribute, sentiments in sentiment_results.items(): pdf.cell(200, 10, txt=f"\nSentiments for {attribute}:", ln=True) for sentiment in sentiments: pdf.cell(200, 10, txt=f" - {sentiment['sentence']} [Score: {sentiment['score']}]", ln=True) # Generate Wordclouds for attribute, wordcloud in wordclouds.items(): wordcloud_image_path = f"{attribute}_wordcloud.png" wordcloud.to_file(wordcloud_image_path) pdf.add_page() pdf.image(wordcloud_image_path, x=10, y=10, w=180) output_pdf_path = "sentiment_report.pdf" pdf.output(output_pdf_path) return output_pdf_path