# backend.py
import spacy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable
from googleapiclient.discovery import build
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re

# Initialize spaCy and VADER
nlp = spacy.load("en_core_web_sm")
sia = SentimentIntensityAnalyzer()

# YouTube Data API key
YOUTUBE_API_KEY = "YOUR_YOUTUBE_API_KEY"

# Fetch metadata for a YouTube video via the YouTube Data API
def fetch_video_metadata(video_url):
    video_id = video_url.split('v=')[-1]
    youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)
    try:
        request = youtube.videos().list(part="snippet,statistics", id=video_id)
        response = request.execute()
        if not response.get('items'):
            # The Data API returns an empty items list for missing or private videos
            return None, "Video is unavailable."
        video_data = response['items'][0]
        metadata = {
            "channel_name": video_data['snippet']['channelTitle'],
            "video_title": video_data['snippet']['title'],
            "views": video_data['statistics']['viewCount'],
            "likes": video_data['statistics'].get('likeCount', 'N/A'),
            "dislikes": video_data['statistics'].get('dislikeCount', 'N/A'),
            "posted_date": video_data['snippet']['publishedAt']
        }
        return metadata, None
    except Exception as e:
        return None, str(e)
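
# fetch_video_metadata and fetch_transcript share a (value, error) return convention:
# (data, None) on success, (None, "message") on failure. Illustrative usage only, with
# a placeholder URL:
#   metadata, error = fetch_video_metadata("https://www.youtube.com/watch?v=VIDEO_ID")
#   if error is None:
#       print(metadata["video_title"], metadata["views"])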

# Fetch the transcript for a YouTube video
def fetch_transcript(video_url):
    video_id = video_url.split('v=')[-1]
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        text = " ".join([t['text'] for t in transcript])
        return text, None
    except (TranscriptsDisabled, VideoUnavailable):
        return None, "Transcript not available for this video."
    except Exception as e:
        return None, str(e)
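
# Note: YouTubeTranscriptApi.get_transcript() is the classic (pre-1.0) interface of
# youtube-transcript-api; newer releases moved to an instance-based API, so this call
# may need adjusting if the dependency is upgraded. Auto-generated transcripts also
# tend to arrive without sentence punctuation, which is presumably why the splitter
# below falls back on conjunctions as break points.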

# Split long sentences into shorter chunks for better processing
def split_long_sentences(text):
    doc = nlp(text)  # Tokenize into sentences using spaCy
    sentences = []
    for sent in doc.sents:
        if len(sent.text.split()) > 25:
            sub_sentences = []
            current_chunk = []
            for token in sent:
                current_chunk.append(token.text)
                # Close a chunk at sentence-final punctuation
                if token.is_punct and token.text in {".", "!", "?"}:
                    sub_sentences.append(" ".join(current_chunk).strip())
                    current_chunk = []
                # Or at a conjunction, once the chunk holds more than three tokens
                elif token.text.lower() in {"and", "but", "because", "so"}:
                    if len(current_chunk) > 3:
                        sub_sentences.append(" ".join(current_chunk).strip())
                        current_chunk = []
            # Flush whatever remains after the last break point
            if current_chunk:
                sub_sentences.append(" ".join(current_chunk).strip())
            sentences.extend(sub_sentences)
        else:
            sentences.append(sent.text.strip())
    return sentences

# Read the keywords from the provided Excel file
def read_keywords(file_path):
    df = pd.read_excel(file_path)
    attributes = df.columns.tolist()
    keywords = {}
    for attribute in attributes:
        keywords[attribute] = df[attribute].dropna().tolist()
    return keywords, attributes
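
# Assumed spreadsheet layout (inferred from how read_keywords consumes it): one column
# per attribute, with that attribute's keywords listed in the rows beneath it. The
# attribute and keyword names below are purely illustrative:
#
#   | battery      | camera    | price  |
#   |--------------|-----------|--------|
#   | battery life | photo     | cheap  |
#   | charging     | low light | budget |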

# Match keywords against sentences
def match_keywords_in_sentences(sentences, keywords):
    matched_keywords = {attribute: [] for attribute in keywords}
    for sentence in sentences:
        for attribute, sub_keywords in keywords.items():
            for keyword in sub_keywords:
                if keyword.lower() in sentence.lower():
                    matched_keywords[attribute].append(sentence)
                    break  # Avoid adding the same sentence more than once per attribute
    return matched_keywords
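
# Minimal sketch of how these helpers might be chained together; the actual wiring
# presumably lives elsewhere in this Space, and the URL and file name here are placeholders:
#   transcript, err = fetch_transcript("https://www.youtube.com/watch?v=VIDEO_ID")
#   if err is None:
#       sentences = split_long_sentences(transcript)
#       keywords, attributes = read_keywords("keywords.xlsx")
#       matches = match_keywords_in_sentences(sentences, keywords)
#       # matches maps each attribute to the transcript sentences that mention it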