temo12 committed on
Commit 684b28f · verified · 1 Parent(s): a52005e

Update backend.py

Files changed (1):
  1. backend.py +54 -23
backend.py CHANGED
@@ -1,14 +1,11 @@
-
- # backend.py
-
import spacy
+ import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable
from googleapiclient.discovery import build
- import pandas as pd
- from wordcloud import WordCloud
- import matplotlib.pyplot as plt
+ from fpdf import FPDF
import re
+ from wordcloud import WordCloud

# Initialize Spacy and VADER
nlp = spacy.load("en_core_web_sm")
@@ -17,16 +14,16 @@ sia = SentimentIntensityAnalyzer()
# YouTube Data API key
YOUTUBE_API_KEY = "AIzaSyDUVh0epMGyeAFwaGl2v58tqlwcsIXzAcU"

- # Fetch metadata of YouTube Video
def fetch_video_metadata(video_url):
    video_id = video_url.split('v=')[-1]
    youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)
-
+
    try:
        request = youtube.videos().list(part="snippet,statistics", id=video_id)
        response = request.execute()
-
+
        video_data = response['items'][0]
+
        metadata = {
            "channel_name": video_data['snippet']['channelTitle'],
            "video_title": video_data['snippet']['title'],
@@ -37,28 +34,28 @@ def fetch_video_metadata(video_url):
        }

        return metadata, None
+
    except VideoUnavailable:
        return None, "Video is unavailable."
    except Exception as e:
        return None, str(e)

- # Fetch the transcript for YouTube Video
def fetch_transcript(video_url):
    video_id = video_url.split('v=')[-1]
+
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        text = " ".join([t['text'] for t in transcript])
        return text, None
+
    except (TranscriptsDisabled, VideoUnavailable):
        return None, "Transcript not available for this video."
    except Exception as e:
        return None, str(e)

- # Split long sentences into chunks for better processing
def split_long_sentences(text):
-     doc = nlp(text)  # Tokenize into sentences using Spacy
+     doc = nlp(text)
    sentences = []
-
    for sent in doc.sents:
        if len(sent.text.split()) > 25:
            sub_sentences = []
@@ -75,25 +72,19 @@ def split_long_sentences(text):

            if current_chunk:
                sub_sentences.append(" ".join(current_chunk).strip())
+
            sentences.extend(sub_sentences)
        else:
            sentences.append(sent.text.strip())

    return sentences

- # Read the keywords from the provided Excel file
def read_keywords(file_path):
-     df = pd.read_excel(file_path)
-
+     df = pd.read_excel(file_path.name)  # Use file_path.name since it's a Gradio file object
    attributes = df.columns.tolist()
-     keywords = {}
-
-     for attribute in attributes:
-         keywords[attribute] = df[attribute].dropna().tolist()
-
+     keywords = {attribute: df[attribute].dropna().tolist() for attribute in attributes}
    return keywords, attributes

- # Match keywords with sentences
def match_keywords_in_sentences(sentences, keywords):
    matched_keywords = {attribute: [] for attribute in keywords}
    for sentence in sentences:
@@ -101,4 +92,44 @@ def match_keywords_in_sentences(sentences, keywords):
            for keyword in sub_keywords:
                if keyword.lower() in sentence.lower():
                    matched_keywords[attribute].append(sentence)
-     return matched_keywords
+     return matched_keywords
+
+ def analyze_sentiment_for_keywords(matched_keywords, sentences):
+     sentiment_results = {attribute: [] for attribute in matched_keywords}
+     for attribute, matched_sentences in matched_keywords.items():
+         for sentence in matched_sentences:
+             sentiment_score = sia.polarity_scores(sentence)["compound"]
+             sentiment_results[attribute].append({"sentence": sentence, "score": sentiment_score})
+     return sentiment_results
+
+ def generate_word_clouds(matched_keywords):
+     wordclouds = {attribute: WordCloud().generate(" ".join(sentences)) for attribute, sentences in matched_keywords.items()}
+     return wordclouds
+
+ def generate_pdf_with_sections(metadata, sentiment_results, wordclouds):
+     pdf = FPDF()
+     pdf.add_page()
+     pdf.set_font("Arial", size=12)
+
+     # Add metadata to PDF
+     pdf.cell(200, 10, txt=f"Video Title: {metadata['video_title']}", ln=True)
+     pdf.cell(200, 10, txt=f"Channel: {metadata['channel_name']}", ln=True)
+     pdf.cell(200, 10, txt=f"Posted Date: {metadata['posted_date']}", ln=True)
+     pdf.cell(200, 10, txt=f"Views: {metadata['views']}", ln=True)
+
+     # Add Sentiment Analysis Results
+     for attribute, sentiments in sentiment_results.items():
+         pdf.cell(200, 10, txt=f"\nSentiments for {attribute}:", ln=True)
+         for sentiment in sentiments:
+             pdf.cell(200, 10, txt=f" - {sentiment['sentence']} [Score: {sentiment['score']}]", ln=True)
+
+     # Generate Wordclouds
+     for attribute, wordcloud in wordclouds.items():
+         wordcloud_image_path = f"{attribute}_wordcloud.png"
+         wordcloud.to_file(wordcloud_image_path)
+         pdf.add_page()
+         pdf.image(wordcloud_image_path, x=10, y=10, w=180)
+
+     output_pdf_path = "sentiment_report.pdf"
+     pdf.output(output_pdf_path)
+     return output_pdf_path
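
For orientation, the functions in the updated backend.py chain into a single report-generation pass: fetch metadata and transcript, split the transcript into sentences, match keyword lists read from an Excel sheet, score the matched sentences with VADER, build word clouds, and write everything to a PDF. The driver below is a minimal sketch rather than part of the commit: the video URL and keywords.xlsx path are placeholders, and SimpleNamespace(name=...) stands in for the Gradio upload object whose .name attribute read_keywords now expects.

# Hypothetical driver (not in the commit): run the backend pipeline once.
from types import SimpleNamespace

import backend

video_url = "https://www.youtube.com/watch?v=VIDEO_ID"  # placeholder URL

metadata, meta_err = backend.fetch_video_metadata(video_url)
transcript, transcript_err = backend.fetch_transcript(video_url)
if meta_err or transcript_err:
    raise SystemExit(meta_err or transcript_err)

# read_keywords reads file_path.name, so wrap a plain path in an object exposing .name.
keywords, attributes = backend.read_keywords(SimpleNamespace(name="keywords.xlsx"))

sentences = backend.split_long_sentences(transcript)
matched = backend.match_keywords_in_sentences(sentences, keywords)
sentiments = backend.analyze_sentiment_for_keywords(matched, sentences)

# WordCloud().generate("") raises on empty text, so skip attributes with no matched sentences.
matched_nonempty = {attr: sents for attr, sents in matched.items() if sents}
wordclouds = backend.generate_word_clouds(matched_nonempty)

pdf_path = backend.generate_pdf_with_sections(metadata, sentiments, wordclouds)
print(f"Report written to {pdf_path}")

The file_path.name access is what ties read_keywords to a Gradio-style upload object; a caller holding only a filesystem path has to wrap it as shown above.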