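"""Sentiment analysis of a YouTube video's transcript.

Fetches video metadata via the YouTube Data API, pulls the transcript with
youtube-transcript-api, splits it into sentences, matches those sentences
against keyword lists read from an Excel file, scores each matched sentence
with VADER, and writes the results (plus per-attribute word clouds) to a PDF
report.
"""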
import spacy
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable
from googleapiclient.discovery import build
from fpdf import FPDF
import re
from wordcloud import WordCloud

# Initialize Spacy and VADER
nlp = spacy.load("en_core_web_sm")
sia = SentimentIntensityAnalyzer()

# YouTube Data API key
YOUTUBE_API_KEY = "YOUR_API_KEY"
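# NOTE: replace the placeholder above with a real API key that has the
# YouTube Data API v3 enabled in the Google Cloud Console.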

def fetch_video_metadata(video_url):
    # Strip any trailing query parameters (e.g. "&t=30s") from the video ID
    video_id = video_url.split('v=')[-1].split('&')[0]
    youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)

    try:
        request = youtube.videos().list(part="snippet,statistics", id=video_id)
        response = request.execute()

        # The Data API returns an empty item list (not an error) for unknown video IDs
        if not response.get('items'):
            return None, "Video is unavailable."

        video_data = response['items'][0]

        metadata = {
            "channel_name": video_data['snippet']['channelTitle'],
            "video_title": video_data['snippet']['title'],
            "views": video_data['statistics']['viewCount'],
            "likes": video_data['statistics'].get('likeCount', 'N/A'),
            # Dislike counts are typically no longer exposed by the API, hence the fallback
            "dislikes": video_data['statistics'].get('dislikeCount', 'N/A'),
            "posted_date": video_data['snippet']['publishedAt']
        }

        return metadata, None

    except Exception as e:
        return None, str(e)

def fetch_transcript(video_url):
    # Strip any trailing query parameters from the video ID
    video_id = video_url.split('v=')[-1].split('&')[0]

    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        text = " ".join([t['text'] for t in transcript])
        return text, None

    except (TranscriptsDisabled, VideoUnavailable):
        return None, "Transcript not available for this video."
    except Exception as e:
        return None, str(e)

def split_long_sentences(text):
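    # Sentences longer than 25 words are broken into smaller chunks at
    # sentence-final punctuation or common conjunctions, so sentiment is
    # scored on shorter, more focused statements.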
    doc = nlp(text)
    sentences = []
    for sent in doc.sents:
        if len(sent.text.split()) > 25:
            sub_sentences = []
            current_chunk = []
            for token in sent:
                current_chunk.append(token.text)
                if token.is_punct and token.text in {".", "!", "?"}:
                    sub_sentences.append(" ".join(current_chunk).strip())
                    current_chunk = []
                elif token.text.lower() in {"and", "but", "because", "so"}:
                    if len(current_chunk) > 3:
                        sub_sentences.append(" ".join(current_chunk).strip())
                        current_chunk = []

            if current_chunk:
                sub_sentences.append(" ".join(current_chunk).strip())

            sentences.extend(sub_sentences)
        else:
            sentences.append(sent.text.strip())

    return sentences

def read_keywords(file_path):
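    # Each column header in the Excel sheet is an attribute name; the
    # non-empty cells below it are the keywords for that attribute.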
    df = pd.read_excel(file_path.name)  # Use file_path.name since it's a Gradio file object
    attributes = df.columns.tolist()
    keywords = {attribute: df[attribute].dropna().tolist() for attribute in attributes}
    return keywords, attributes

def match_keywords_in_sentences(sentences, keywords):
    matched_keywords = {attribute: [] for attribute in keywords}
    for sentence in sentences:
        for attribute, sub_keywords in keywords.items():
            for keyword in sub_keywords:
                # str() guards against non-string cells (e.g. numbers) read from Excel
                if str(keyword).lower() in sentence.lower():
                    matched_keywords[attribute].append(sentence)
                    break  # avoid adding the same sentence twice for one attribute
    return matched_keywords

def analyze_sentiment_for_keywords(matched_keywords, sentences):
    sentiment_results = {attribute: [] for attribute in matched_keywords}
    for attribute, matched_sentences in matched_keywords.items():
        for sentence in matched_sentences:
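            # VADER's compound score ranges from -1 (most negative) to +1 (most positive)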
            sentiment_score = sia.polarity_scores(sentence)["compound"]
            sentiment_results[attribute].append({"sentence": sentence, "score": sentiment_score})
    return sentiment_results

def generate_word_clouds(matched_keywords):
    # Skip attributes with no matches; WordCloud raises a ValueError on empty text
    wordclouds = {attribute: WordCloud().generate(" ".join(sentences))
                  for attribute, sentences in matched_keywords.items() if sentences}
    return wordclouds

def generate_pdf_with_sections(metadata, sentiment_results, wordclouds):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    def latin1(text):
        # The core FPDF fonts only support Latin-1; replace unsupported characters
        return str(text).encode('latin-1', 'replace').decode('latin-1')

    # Add metadata to PDF
    pdf.cell(200, 10, txt=latin1(f"Video Title: {metadata['video_title']}"), ln=True)
    pdf.cell(200, 10, txt=latin1(f"Channel: {metadata['channel_name']}"), ln=True)
    pdf.cell(200, 10, txt=latin1(f"Posted Date: {metadata['posted_date']}"), ln=True)
    pdf.cell(200, 10, txt=latin1(f"Views: {metadata['views']}"), ln=True)

    # Add sentiment analysis results, one section per attribute
    for attribute, sentiments in sentiment_results.items():
        pdf.ln(5)
        pdf.cell(200, 10, txt=latin1(f"Sentiments for {attribute}:"), ln=True)
        for sentiment in sentiments:
            # multi_cell wraps long sentences instead of overflowing the page width
            pdf.multi_cell(0, 10, txt=latin1(f"  - {sentiment['sentence']} [Score: {sentiment['score']}]"))

    # Render each word cloud to an image and place it on its own page
    for attribute, wordcloud in wordclouds.items():
        wordcloud_image_path = f"{attribute}_wordcloud.png"
        wordcloud.to_file(wordcloud_image_path)
        pdf.add_page()
        pdf.image(wordcloud_image_path, x=10, y=10, w=180)

    output_pdf_path = "sentiment_report.pdf"
    pdf.output(output_pdf_path)
    return output_pdf_path
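

# --- Example usage -----------------------------------------------------------
# A minimal sketch of how the functions above might be wired together into a
# full pipeline. The video URL and keyword workbook path below are hypothetical
# placeholders; the original app appears to drive these functions from a Gradio
# interface (see the note in read_keywords), which is not shown in this file.
if __name__ == "__main__":
    from types import SimpleNamespace

    video_url = "https://www.youtube.com/watch?v=VIDEO_ID"  # hypothetical placeholder

    metadata, meta_error = fetch_video_metadata(video_url)
    transcript, transcript_error = fetch_transcript(video_url)

    if meta_error or transcript_error:
        print(meta_error or transcript_error)
    else:
        sentences = split_long_sentences(transcript)

        # read_keywords expects an object with a .name attribute (a Gradio file);
        # a plain path can be wrapped to satisfy that interface.
        keyword_file = SimpleNamespace(name="keywords.xlsx")  # hypothetical workbook

        keywords, attributes = read_keywords(keyword_file)
        matched = match_keywords_in_sentences(sentences, keywords)
        sentiments = analyze_sentiment_for_keywords(matched, sentences)
        wordclouds = generate_word_clouds(matched)

        report_path = generate_pdf_with_sections(metadata, sentiments, wordclouds)
        print(f"Report written to {report_path}")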