Create backend.py

backend.py ADDED (+104 -0)
# backend.py

import spacy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable
from googleapiclient.discovery import build
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re

# Initialize Spacy and VADER
nlp = spacy.load("en_core_web_sm")
sia = SentimentIntensityAnalyzer()

# YouTube Data API key
YOUTUBE_API_KEY = "YOUR_YOUTUBE_API_KEY"
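# Setup note (an assumption, not part of the original commit): the imports above
# imply these installs, plus the spaCy English model and an Excel engine for pandas:
#   pip install spacy vaderSentiment youtube-transcript-api google-api-python-client pandas wordcloud matplotlib openpyxl
#   python -m spacy download en_core_web_sm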
# Fetch metadata of YouTube Video
def fetch_video_metadata(video_url):
    # Take the ID after "v=" and drop any trailing query parameters (e.g. "&t=30s")
    video_id = video_url.split('v=')[-1].split('&')[0]
    youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)

    try:
        request = youtube.videos().list(part="snippet,statistics", id=video_id)
        response = request.execute()

        video_data = response['items'][0]
        metadata = {
            "channel_name": video_data['snippet']['channelTitle'],
            "video_title": video_data['snippet']['title'],
            "views": video_data['statistics']['viewCount'],
            "likes": video_data['statistics'].get('likeCount', 'N/A'),
            # dislikeCount is no longer public in the Data API, so this usually falls back to 'N/A'
            "dislikes": video_data['statistics'].get('dislikeCount', 'N/A'),
            "posted_date": video_data['snippet']['publishedAt']
        }

        return metadata, None
    except VideoUnavailable:
        return None, "Video is unavailable."
    except Exception as e:
        # Also covers an empty 'items' list (invalid or private video ID)
        return None, str(e)
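# Example usage (hypothetical URL; requires a real key in YOUTUBE_API_KEY):
#   metadata, error = fetch_video_metadata("https://www.youtube.com/watch?v=dQw4w9WgXcQ")
#   if error is None:
#       print(metadata["video_title"], metadata["views"])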
# Fetch the transcript for YouTube Video
def fetch_transcript(video_url):
    video_id = video_url.split('v=')[-1].split('&')[0]
    try:
        # Returns a list of {'text', 'start', 'duration'} snippets
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        text = " ".join([t['text'] for t in transcript])
        return text, None
    except (TranscriptsDisabled, VideoUnavailable):
        return None, "Transcript not available for this video."
    except Exception as e:
        return None, str(e)
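# Example usage (hypothetical URL; the video must have captions enabled):
#   text, error = fetch_transcript("https://www.youtube.com/watch?v=dQw4w9WgXcQ")
#   if error is None:
#       print(text[:200])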
# Split long sentences into chunks for better processing
def split_long_sentences(text):
    doc = nlp(text)  # Tokenize into sentences using Spacy
    sentences = []

    for sent in doc.sents:
        if len(sent.text.split()) > 25:
            # Long sentence: break it at end punctuation or at common
            # conjunctions so each chunk stays short enough to score
            sub_sentences = []
            current_chunk = []
            for token in sent:
                current_chunk.append(token.text)
                if token.is_punct and token.text in {".", "!", "?"}:
                    sub_sentences.append(" ".join(current_chunk).strip())
                    current_chunk = []
                elif token.text.lower() in {"and", "but", "because", "so"}:
                    # Only split at a conjunction if the chunk has some substance
                    if len(current_chunk) > 3:
                        sub_sentences.append(" ".join(current_chunk).strip())
                        current_chunk = []

            if current_chunk:
                sub_sentences.append(" ".join(current_chunk).strip())
            sentences.extend(sub_sentences)
        else:
            sentences.append(sent.text.strip())

    return sentences
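# Illustrative behavior: a 30-word run-on such as
#   "The camera is great and the battery lasts all day because ..."
# comes back as shorter chunks split after "and" / "because", while
# sentences of 25 words or fewer pass through unchanged.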
# Read the keywords from the provided Excel file
def read_keywords(file_path):
    df = pd.read_excel(file_path)

    # Each column header is an attribute; the cells below it are that attribute's keywords
    attributes = df.columns.tolist()
    keywords = {}

    for attribute in attributes:
        keywords[attribute] = df[attribute].dropna().tolist()

    return keywords, attributes
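# Expected spreadsheet layout (inferred from the code above), e.g.:
#   | Battery      | Camera        | Display    |
#   | battery life | photo quality | screen     |
#   | charging     | low light     | brightness |
# -> ({"Battery": ["battery life", "charging"], ...}, ["Battery", "Camera", "Display"])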
# Match keywords with sentences
def match_keywords_in_sentences(sentences, keywords):
    matched_keywords = {attribute: [] for attribute in keywords}
    for sentence in sentences:
        for attribute, sub_keywords in keywords.items():
            for keyword in sub_keywords:
                if keyword.lower() in sentence.lower():
                    matched_keywords[attribute].append(sentence)
                    break  # one match suffices; avoids duplicate sentences per attribute
    return matched_keywords
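
# A minimal end-to-end sketch (an assumption about intended use, not part of the
# original commit): fetch a transcript, chunk it, match keywords from a spreadsheet,
# and score each matched sentence with the VADER analyzer initialized above.
# The URL and "keywords.xlsx" are hypothetical placeholders.
if __name__ == "__main__":
    url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"  # hypothetical
    text, error = fetch_transcript(url)
    if error:
        print(f"Error: {error}")
    else:
        sentences = split_long_sentences(text)
        keywords, attributes = read_keywords("keywords.xlsx")  # hypothetical path
        matches = match_keywords_in_sentences(sentences, keywords)
        for attribute, matched in matches.items():
            for sentence in matched:
                # polarity_scores returns {'neg', 'neu', 'pos', 'compound'}
                score = sia.polarity_scores(sentence)["compound"]
                print(f"[{attribute}] ({score:+.2f}) {sentence}")

        # The unused WordCloud/matplotlib imports suggest a visualization step;
        # a plausible one, sketched here as an assumption:
        wc = WordCloud(width=800, height=400, background_color="white").generate(text)
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        plt.show()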