temo12 committed on
Commit 6b62c63 · verified · 1 Parent(s): cd3b063

Create backend.py

Files changed (1)
  1. backend.py +104 -0
backend.py ADDED
@@ -0,0 +1,104 @@
+
+ # backend.py
+
+ import spacy
+ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
+ from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable
+ from googleapiclient.discovery import build
+ import pandas as pd
+ from wordcloud import WordCloud
+ import matplotlib.pyplot as plt
+ import re
+
+ # Initialize spaCy and VADER
+ nlp = spacy.load("en_core_web_sm")
+ sia = SentimentIntensityAnalyzer()
+
+ # YouTube Data API key
+ YOUTUBE_API_KEY = "YOUR_YOUTUBE_API_KEY"
+
+ # Fetch metadata for a YouTube video via the YouTube Data API
+ def fetch_video_metadata(video_url):
+     video_id = video_url.split('v=')[-1].split('&')[0]  # drop any trailing query parameters
+     youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)
+
+     try:
+         request = youtube.videos().list(part="snippet,statistics", id=video_id)
+         response = request.execute()
+
+         video_data = response['items'][0]
+         metadata = {
+             "channel_name": video_data['snippet']['channelTitle'],
+             "video_title": video_data['snippet']['title'],
+             "views": video_data['statistics']['viewCount'],
+             "likes": video_data['statistics'].get('likeCount', 'N/A'),
+             "dislikes": video_data['statistics'].get('dislikeCount', 'N/A'),
+             "posted_date": video_data['snippet']['publishedAt']
+         }
+
+         return metadata, None
+     except IndexError:  # empty 'items' means the video could not be found
+         return None, "Video is unavailable."
+     except Exception as e:
+         return None, str(e)
+
+ # Fetch the transcript for a YouTube video
+ def fetch_transcript(video_url):
+     video_id = video_url.split('v=')[-1].split('&')[0]  # drop any trailing query parameters
+     try:
+         transcript = YouTubeTranscriptApi.get_transcript(video_id)
+         text = " ".join([t['text'] for t in transcript])
+         return text, None
+     except (TranscriptsDisabled, VideoUnavailable):
+         return None, "Transcript not available for this video."
+     except Exception as e:
+         return None, str(e)
+
+ # Split long sentences into chunks for better processing
+ def split_long_sentences(text):
+     doc = nlp(text)  # Tokenize into sentences using spaCy
+     sentences = []
+
+     for sent in doc.sents:
+         if len(sent.text.split()) > 25:  # only re-split sentences longer than 25 words
+             sub_sentences = []
+             current_chunk = []
+             for token in sent:
+                 current_chunk.append(token.text)
+                 if token.is_punct and token.text in {".", "!", "?"}:  # split at sentence-final punctuation
+                     sub_sentences.append(" ".join(current_chunk).strip())
+                     current_chunk = []
+                 elif token.text.lower() in {"and", "but", "because", "so"}:  # also split after common conjunctions
+                     if len(current_chunk) > 3:  # but only if the chunk is long enough to stand alone
+                         sub_sentences.append(" ".join(current_chunk).strip())
+                         current_chunk = []
+
+             if current_chunk:
+                 sub_sentences.append(" ".join(current_chunk).strip())
+             sentences.extend(sub_sentences)
+         else:
+             sentences.append(sent.text.strip())
+
+     return sentences
+
+ # Read the keywords from the provided Excel file
+ def read_keywords(file_path):
+     df = pd.read_excel(file_path)
+
+     attributes = df.columns.tolist()
+     keywords = {}
+
+     for attribute in attributes:
+         keywords[attribute] = df[attribute].dropna().tolist()
+
+     return keywords, attributes
+
+ # Match keywords with sentences (case-insensitive substring match)
+ def match_keywords_in_sentences(sentences, keywords):
+     matched_keywords = {attribute: [] for attribute in keywords}
+     for sentence in sentences:
+         for attribute, sub_keywords in keywords.items():
+             for keyword in sub_keywords:
+                 if keyword.lower() in sentence.lower():
+                     matched_keywords[attribute].append(sentence)  # note: a sentence is appended once per matching keyword
+     return matched_keywords
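
The file defines five steps (metadata fetch, transcript fetch, sentence chunking, keyword loading, keyword matching) but ships no driver, and the VADER analyzer `sia` is initialized without being called anywhere in the code shown. The following is a minimal, hypothetical sketch of how the pieces could be chained together; the example URL, the `keywords.xlsx` file name, and the per-attribute sentiment averaging are illustrative assumptions, not part of the commit.

# sketch_usage.py: hypothetical driver for the functions committed in backend.py.
# Assumes a valid YOUTUBE_API_KEY inside backend.py; the URL and the
# "keywords.xlsx" file name below are illustrative.
from backend import (
    fetch_video_metadata,
    fetch_transcript,
    split_long_sentences,
    read_keywords,
    match_keywords_in_sentences,
    sia,  # the VADER analyzer backend.py initializes but never calls in the shown code
)

video_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"  # illustrative URL

metadata, err = fetch_video_metadata(video_url)
if err:
    raise SystemExit(f"Metadata error: {err}")
print(f"{metadata['video_title']} by {metadata['channel_name']} ({metadata['views']} views)")

transcript, err = fetch_transcript(video_url)
if err:
    raise SystemExit(f"Transcript error: {err}")

# Chunk the transcript into sentence-sized pieces, then bucket them by keyword.
sentences = split_long_sentences(transcript)
keywords, attributes = read_keywords("keywords.xlsx")  # hypothetical file name
matched = match_keywords_in_sentences(sentences, keywords)

# One plausible use of the otherwise-unused VADER analyzer: average the compound
# sentiment of the sentences matched for each attribute (an assumption).
for attribute in attributes:
    hits = matched[attribute]
    if hits:
        mean = sum(sia.polarity_scores(s)["compound"] for s in hits) / len(hits)
        print(f"{attribute}: {len(hits)} sentences, mean compound sentiment {mean:+.3f}")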
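
`WordCloud` and `matplotlib` are likewise imported but unused in the shown file, which suggests a frequency visualization was intended. A short sketch under that assumption, reusing `transcript` from the driver above:

# Hypothetical use of the unused WordCloud/matplotlib imports: render a word
# cloud of the fetched transcript.
from wordcloud import WordCloud
import matplotlib.pyplot as plt

wordcloud = WordCloud(width=800, height=400, background_color="white").generate(transcript)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()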