# backend.py
import spacy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable
from googleapiclient.discovery import build
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re

# Initialize spaCy and VADER
nlp = spacy.load("en_core_web_sm")
sia = SentimentIntensityAnalyzer()

# YouTube Data API key (replace with your own key)
YOUTUBE_API_KEY = "YOUR_YOUTUBE_API_KEY"

# Fetch metadata for a YouTube video via the YouTube Data API
def fetch_video_metadata(video_url):
    # Keep only the video id, dropping any trailing query parameters (e.g. "&t=30s")
    video_id = video_url.split('v=')[-1].split('&')[0]
    youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)
    try:
        request = youtube.videos().list(part="snippet,statistics", id=video_id)
        response = request.execute()
        if not response['items']:
            return None, "Video is unavailable."
        video_data = response['items'][0]
        metadata = {
            "channel_name": video_data['snippet']['channelTitle'],
            "video_title": video_data['snippet']['title'],
            "views": video_data['statistics']['viewCount'],
            "likes": video_data['statistics'].get('likeCount', 'N/A'),
            "dislikes": video_data['statistics'].get('dislikeCount', 'N/A'),
            "posted_date": video_data['snippet']['publishedAt']
        }
        return metadata, None
    except Exception as e:
        return None, str(e)

# Fetch the transcript for a YouTube video
def fetch_transcript(video_url):
    video_id = video_url.split('v=')[-1].split('&')[0]
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        text = " ".join([t['text'] for t in transcript])
        return text, None
    except (TranscriptsDisabled, VideoUnavailable):
        return None, "Transcript not available for this video."
    except Exception as e:
        return None, str(e)

# Split long sentences into smaller chunks for better processing
def split_long_sentences(text):
    doc = nlp(text)  # Tokenize into sentences using spaCy
    sentences = []
    for sent in doc.sents:
        # Only break up sentences longer than 25 words
        if len(sent.text.split()) > 25:
            sub_sentences = []
            current_chunk = []
            for token in sent:
                current_chunk.append(token.text)
                # End a chunk at sentence-final punctuation...
                if token.is_punct and token.text in {".", "!", "?"}:
                    sub_sentences.append(" ".join(current_chunk).strip())
                    current_chunk = []
                # ...or at a conjunction, provided the chunk is long enough
                elif token.text.lower() in {"and", "but", "because", "so"}:
                    if len(current_chunk) > 3:
                        sub_sentences.append(" ".join(current_chunk).strip())
                        current_chunk = []
            if current_chunk:
                sub_sentences.append(" ".join(current_chunk).strip())
            sentences.extend(sub_sentences)
        else:
            sentences.append(sent.text.strip())
    return sentences

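# Illustrative sketch (an assumption, not wired into the pipeline above): the
# SentimentIntensityAnalyzer initialized as `sia` can score each split sentence.
# The function name and the use of the compound score are examples only.
def score_sentences_example(sentences):
    scores = []
    for sentence in sentences:
        # polarity_scores returns neg/neu/pos plus a compound score in [-1, 1]
        compound = sia.polarity_scores(sentence)["compound"]
        scores.append({"sentence": sentence, "compound": compound})
    return scores
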
# Read the keywords from the provided Excel file
def read_keywords(file_path):
    df = pd.read_excel(file_path)
    attributes = df.columns.tolist()
    keywords = {}
    for attribute in attributes:
        keywords[attribute] = df[attribute].dropna().tolist()
    return keywords, attributes

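# Assumed layout of the keywords spreadsheet (illustrative example only; the
# real attribute names and keywords come from whatever file is supplied):
#
#   | Battery      | Camera        | Performance |
#   |--------------|---------------|-------------|
#   | battery life | photo quality | lag         |
#   | charging     | low light     | smooth      |
#
# read_keywords() turns each column into {"Battery": ["battery life", ...], ...}
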
# Match keywords with sentences
def match_keywords_in_sentences(sentences, keywords):
    matched_keywords = {attribute: [] for attribute in keywords}
    for sentence in sentences:
        for attribute, sub_keywords in keywords.items():
            for keyword in sub_keywords:
                if keyword.lower() in sentence.lower():
                    matched_keywords[attribute].append(sentence)
                    break  # Avoid adding the same sentence twice for one attribute
    return matched_keywords
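
# Sketch of how the pieces above could be chained together. The URL and the
# keywords file path are placeholders; in practice a frontend (e.g. a Streamlit
# or Gradio app) would supply real values and render the results.
if __name__ == "__main__":
    sample_url = "https://www.youtube.com/watch?v=VIDEO_ID"  # placeholder
    keywords_file = "keywords.xlsx"                           # placeholder

    metadata, meta_error = fetch_video_metadata(sample_url)
    transcript, transcript_error = fetch_transcript(sample_url)

    if meta_error or transcript_error:
        print(meta_error or transcript_error)
    else:
        print(metadata["video_title"], "-", metadata["channel_name"])
        sentences = split_long_sentences(transcript)
        keywords, attributes = read_keywords(keywords_file)
        matches = match_keywords_in_sentences(sentences, keywords)
        for attribute in attributes:
            print(attribute, "->", len(matches[attribute]), "matching sentences")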