import re

import pandas as pd
import spacy
from fpdf import FPDF
from googleapiclient.discovery import build
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable

# Initialize spaCy and VADER
nlp = spacy.load("en_core_web_sm")
sia = SentimentIntensityAnalyzer()

# YouTube Data API key
YOUTUBE_API_KEY = "YOUR_API_KEY"
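
# Helper added for robustness: the original split on "v=" keeps trailing query
# parameters (e.g. "&t=10s") and misses youtu.be links. The URL patterns below
# are an assumption about common YouTube URL shapes, not part of the original.
def extract_video_id(video_url):
    match = re.search(r"(?:v=|youtu\.be/|shorts/)([A-Za-z0-9_-]{11})", video_url)
    return match.group(1) if match else video_url.split("v=")[-1]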

def fetch_video_metadata(video_url):
    video_id = extract_video_id(video_url)
    youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)
    try:
        request = youtube.videos().list(part="snippet,statistics", id=video_id)
        response = request.execute()
        video_data = response['items'][0]
        metadata = {
            "channel_name": video_data['snippet']['channelTitle'],
            "video_title": video_data['snippet']['title'],
            "views": video_data['statistics']['viewCount'],
            "likes": video_data['statistics'].get('likeCount', 'N/A'),
            # dislikeCount is no longer exposed by the YouTube Data API,
            # so this will normally fall back to 'N/A'
            "dislikes": video_data['statistics'].get('dislikeCount', 'N/A'),
            "posted_date": video_data['snippet']['publishedAt']
        }
        return metadata, None
    # VideoUnavailable belongs to youtube_transcript_api and is never raised
    # here; an unknown ID shows up as an empty 'items' list instead.
    except IndexError:
        return None, "Video is unavailable."
    except Exception as e:
        return None, str(e)

def fetch_transcript(video_url):
    video_id = extract_video_id(video_url)
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        text = " ".join(segment['text'] for segment in transcript)
        return text, None
    except (TranscriptsDisabled, VideoUnavailable):
        return None, "Transcript not available for this video."
    except Exception as e:
        return None, str(e)

def split_long_sentences(text):
    doc = nlp(text)
    sentences = []
    for sent in doc.sents:
        # Break sentences longer than 25 words at sentence-final punctuation
        # or after coordinating conjunctions, so VADER scores shorter units.
        if len(sent.text.split()) > 25:
            sub_sentences = []
            current_chunk = []
            for token in sent:
                current_chunk.append(token.text)
                if token.is_punct and token.text in {".", "!", "?"}:
                    sub_sentences.append(" ".join(current_chunk).strip())
                    current_chunk = []
                elif token.text.lower() in {"and", "but", "because", "so"}:
                    if len(current_chunk) > 3:
                        sub_sentences.append(" ".join(current_chunk).strip())
                        current_chunk = []
            if current_chunk:
                sub_sentences.append(" ".join(current_chunk).strip())
            sentences.extend(sub_sentences)
        else:
            sentences.append(sent.text.strip())
    return sentences

def read_keywords(file_path):
    # Gradio file objects expose their temp path via .name; accept plain
    # path strings as well so the function also works outside the UI.
    path = getattr(file_path, "name", file_path)
    df = pd.read_excel(path)
    attributes = df.columns.tolist()
    keywords = {attribute: df[attribute].dropna().tolist() for attribute in attributes}
    return keywords, attributes
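
# Expected layout of the keywords workbook (an assumption inferred from
# read_keywords above): one column per attribute, keywords listed beneath.
#
#   | battery      | camera |
#   | battery life | lens   |
#   | charging     | photo  |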

def match_keywords_in_sentences(sentences, keywords):
    matched_keywords = {attribute: [] for attribute in keywords}
    for sentence in sentences:
        for attribute, sub_keywords in keywords.items():
            for keyword in sub_keywords:
                # str() guards against numeric cells read from Excel
                if str(keyword).lower() in sentence.lower():
                    matched_keywords[attribute].append(sentence)
                    break  # avoid adding the same sentence twice per attribute
    return matched_keywords

def analyze_sentiment_for_keywords(matched_keywords):
    # The unused 'sentences' parameter was dropped; everything needed is in
    # matched_keywords. VADER's compound score ranges from -1 (most negative)
    # to +1 (most positive).
    sentiment_results = {attribute: [] for attribute in matched_keywords}
    for attribute, matched_sentences in matched_keywords.items():
        for sentence in matched_sentences:
            sentiment_score = sia.polarity_scores(sentence)["compound"]
            sentiment_results[attribute].append({"sentence": sentence, "score": sentiment_score})
    return sentiment_results

def generate_word_clouds(matched_keywords):
    # WordCloud.generate raises ValueError on empty text, so skip attributes
    # with no matched sentences.
    return {
        attribute: WordCloud().generate(" ".join(sentences))
        for attribute, sentences in matched_keywords.items()
        if sentences
    }

def generate_pdf_with_sections(metadata, sentiment_results, wordclouds):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    def latin1(text):
        # FPDF's core fonts are Latin-1 only; replace unsupported characters
        # rather than letting transcript text raise UnicodeEncodeError.
        return str(text).encode("latin-1", "replace").decode("latin-1")

    # Add metadata to PDF
    pdf.cell(200, 10, txt=latin1(f"Video Title: {metadata['video_title']}"), ln=True)
    pdf.cell(200, 10, txt=latin1(f"Channel: {metadata['channel_name']}"), ln=True)
    pdf.cell(200, 10, txt=f"Posted Date: {metadata['posted_date']}", ln=True)
    pdf.cell(200, 10, txt=f"Views: {metadata['views']}", ln=True)

    # Add sentiment analysis results; multi_cell wraps long sentences that
    # would overflow a fixed-width cell.
    for attribute, sentiments in sentiment_results.items():
        pdf.ln(5)
        pdf.cell(200, 10, txt=latin1(f"Sentiments for {attribute}:"), ln=True)
        for sentiment in sentiments:
            pdf.multi_cell(0, 10, txt=latin1(f" - {sentiment['sentence']} [Score: {sentiment['score']}]"))

    # Add one word cloud image per attribute
    for attribute, wordcloud in wordclouds.items():
        wordcloud_image_path = f"{attribute}_wordcloud.png"
        wordcloud.to_file(wordcloud_image_path)
        pdf.add_page()
        pdf.image(wordcloud_image_path, x=10, y=10, w=180)

    output_pdf_path = "sentiment_report.pdf"
    pdf.output(output_pdf_path)
    return output_pdf_path
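
# Minimal end-to-end sketch, assuming a valid API key above and a local
# "keywords.xlsx"; the URL and file name are placeholders, not part of the
# original app (which wires these functions to a Gradio UI instead).
if __name__ == "__main__":
    url = "https://www.youtube.com/watch?v=VIDEO_ID"
    metadata, meta_err = fetch_video_metadata(url)
    transcript, transcript_err = fetch_transcript(url)
    if meta_err or transcript_err:
        print(meta_err or transcript_err)
    else:
        keywords, _ = read_keywords("keywords.xlsx")
        sentences = split_long_sentences(transcript)
        matched = match_keywords_in_sentences(sentences, keywords)
        results = analyze_sentiment_for_keywords(matched)
        clouds = generate_word_clouds(matched)
        print("Report written to", generate_pdf_with_sections(metadata, results, clouds))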