import spacy
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable
from googleapiclient.discovery import build
from fpdf import FPDF
import re
from wordcloud import WordCloud

# Initialize spaCy and VADER
nlp = spacy.load("en_core_web_sm")
sia = SentimentIntensityAnalyzer()

# YouTube Data API key
YOUTUBE_API_KEY = "YOUR_API_KEY"
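# In a deployed Space the key is usually supplied as a secret/environment variable
# (e.g. os.environ.get("YOUTUBE_API_KEY")) rather than hardcoded; the placeholder
# string above must be replaced with a valid YouTube Data API v3 key.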

def fetch_video_metadata(video_url):
    """Fetch basic metadata for a YouTube video via the Data API v3."""
    # Strip any extra query parameters (e.g. "&t=30s") after the video ID
    video_id = video_url.split('v=')[-1].split('&')[0]
    youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)
    try:
        request = youtube.videos().list(part="snippet,statistics", id=video_id)
        response = request.execute()
        items = response.get('items', [])
        # The Data API returns an empty items list for missing/private videos
        if not items:
            return None, "Video is unavailable."
        video_data = items[0]
        metadata = {
            "channel_name": video_data['snippet']['channelTitle'],
            "video_title": video_data['snippet']['title'],
            "views": video_data['statistics']['viewCount'],
            "likes": video_data['statistics'].get('likeCount', 'N/A'),
            "dislikes": video_data['statistics'].get('dislikeCount', 'N/A'),
            "posted_date": video_data['snippet']['publishedAt']
        }
        return metadata, None
    except Exception as e:
        return None, str(e)

def fetch_transcript(video_url):
    """Fetch the full transcript text for a YouTube video, if one is available."""
    video_id = video_url.split('v=')[-1].split('&')[0]
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        text = " ".join([t['text'] for t in transcript])
        return text, None
    except (TranscriptsDisabled, VideoUnavailable):
        return None, "Transcript not available for this video."
    except Exception as e:
        return None, str(e)

def split_long_sentences(text):
    """Split the transcript into sentences, breaking very long sentences into smaller chunks."""
    doc = nlp(text)
    sentences = []
    for sent in doc.sents:
        # Sentences longer than 25 words are split at end punctuation or at conjunctions
        if len(sent.text.split()) > 25:
            sub_sentences = []
            current_chunk = []
            for token in sent:
                current_chunk.append(token.text)
                if token.is_punct and token.text in {".", "!", "?"}:
                    sub_sentences.append(" ".join(current_chunk).strip())
                    current_chunk = []
                elif token.text.lower() in {"and", "but", "because", "so"}:
                    # Only break at a conjunction once the chunk has some content
                    if len(current_chunk) > 3:
                        sub_sentences.append(" ".join(current_chunk).strip())
                        current_chunk = []
            if current_chunk:
                sub_sentences.append(" ".join(current_chunk).strip())
            sentences.extend(sub_sentences)
        else:
            sentences.append(sent.text.strip())
    return sentences
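
# The keyword spreadsheet is expected to have one column per attribute: the column
# header names the attribute and the cells below it list that attribute's keywords.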
def read_keywords(file_path):
    # Use file_path.name since Gradio passes an uploaded-file object, not a path string
    df = pd.read_excel(file_path.name)
    attributes = df.columns.tolist()
    keywords = {attribute: df[attribute].dropna().tolist() for attribute in attributes}
    return keywords, attributes

def match_keywords_in_sentences(sentences, keywords):
    """Collect, for each attribute, the sentences that mention any of its keywords."""
    matched_keywords = {attribute: [] for attribute in keywords}
    for sentence in sentences:
        for attribute, sub_keywords in keywords.items():
            for keyword in sub_keywords:
                # str() guards against non-text cells (e.g. numbers) read from Excel
                if str(keyword).lower() in sentence.lower():
                    matched_keywords[attribute].append(sentence)
                    break  # avoid adding the same sentence twice for one attribute
    return matched_keywords

def analyze_sentiment_for_keywords(matched_keywords, sentences):
    """Score each matched sentence with VADER's compound sentiment score."""
    sentiment_results = {attribute: [] for attribute in matched_keywords}
    for attribute, matched_sentences in matched_keywords.items():
        for sentence in matched_sentences:
            sentiment_score = sia.polarity_scores(sentence)["compound"]
            sentiment_results[attribute].append({"sentence": sentence, "score": sentiment_score})
    return sentiment_results

def generate_word_clouds(matched_keywords):
    # Skip attributes with no matched sentences: WordCloud.generate() raises on empty text
    wordclouds = {attribute: WordCloud().generate(" ".join(sentences))
                  for attribute, sentences in matched_keywords.items() if sentences}
    return wordclouds

def generate_pdf_with_sections(metadata, sentiment_results, wordclouds):
    """Assemble the metadata, per-attribute sentiment scores and word clouds into a PDF report."""
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    # Add metadata to PDF
    pdf.cell(200, 10, txt=f"Video Title: {metadata['video_title']}", ln=True)
    pdf.cell(200, 10, txt=f"Channel: {metadata['channel_name']}", ln=True)
    pdf.cell(200, 10, txt=f"Posted Date: {metadata['posted_date']}", ln=True)
    pdf.cell(200, 10, txt=f"Views: {metadata['views']}", ln=True)

    # Add sentiment analysis results
    for attribute, sentiments in sentiment_results.items():
        pdf.ln(5)
        pdf.cell(200, 10, txt=f"Sentiments for {attribute}:", ln=True)
        for sentiment in sentiments:
            line = f" - {sentiment['sentence']} [Score: {sentiment['score']}]"
            # FPDF's built-in fonts are Latin-1 only; replace unsupported characters
            line = line.encode("latin-1", "replace").decode("latin-1")
            # multi_cell wraps long transcript sentences instead of overflowing the page
            pdf.multi_cell(0, 10, line)

    # Add one word-cloud image per attribute, each on its own page
    for attribute, wordcloud in wordclouds.items():
        wordcloud_image_path = f"{attribute}_wordcloud.png"
        wordcloud.to_file(wordcloud_image_path)
        pdf.add_page()
        pdf.image(wordcloud_image_path, x=10, y=10, w=180)

    output_pdf_path = "sentiment_report.pdf"
    pdf.output(output_pdf_path)
    return output_pdf_path
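
# Illustrative end-to-end sketch (not the Gradio app wiring, which lives elsewhere):
# the video URL and the keyword dictionary below are assumed example inputs, and a
# valid YOUTUBE_API_KEY must be configured for the metadata call to succeed.
if __name__ == "__main__":
    example_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"  # assumed example video
    example_keywords = {"Battery": ["battery", "charge"], "Camera": ["camera", "photo"]}

    metadata, meta_err = fetch_video_metadata(example_url)
    transcript, transcript_err = fetch_transcript(example_url)
    if meta_err or transcript_err:
        print(meta_err or transcript_err)
    else:
        sentences = split_long_sentences(transcript)
        matched = match_keywords_in_sentences(sentences, example_keywords)
        sentiments = analyze_sentiment_for_keywords(matched, sentences)
        wordclouds = generate_word_clouds(matched)
        print("Report written to", generate_pdf_with_sections(metadata, sentiments, wordclouds))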