import os
import platform
import time
from collections import Counter

import nltk
import requests
from bs4 import BeautifulSoup
from gtts import gTTS
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize

# Download the required NLTK data files.
nltk.download('punkt')          # tokenizer models used by word_tokenize
nltk.download('vader_lexicon')  # VADER sentiment lexicon
nltk.download('averaged_perceptron_tagger')  # POS tagger used by nltk.pos_tag
nltk.download('stopwords')      # English stopword list
def get_bing_news_articles(company_name, num_articles=10):
    """
    Scrapes Bing News search results for a given company name.
    Returns a list of dicts with the title, summary, URL, and source
    of up to num_articles articles.
    """
    query = company_name.replace(" ", "+")
    url = f"https://www.bing.com/news/search?q={query}&FORM=HDRSC6"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code != 200:
        return []
    soup = BeautifulSoup(response.text, "html.parser")
    articles = []
    # Each result is rendered as a "news-card" div in Bing's markup.
    news_cards = soup.find_all("div", class_="news-card")
    for card in news_cards:
        title_tag = card.find("a", class_="title")
        if not title_tag:
            continue
        title = title_tag.get_text(strip=True)
        article_url = title_tag.get("href")
        snippet_tag = card.find("div", class_="snippet")
        snippet = snippet_tag.get_text(strip=True) if snippet_tag else ""
        source_tag = card.find("div", class_="source")
        source = source_tag.get_text(strip=True) if source_tag else ""
        articles.append({
            "title": title,
            "summary": snippet,
            "url": article_url,
            "source": source
        })
        if len(articles) >= num_articles:
            break
    return articles
def analyze_sentiment(text):
    """
    Analyzes the sentiment of the given text using NLTK's VADER.

    Returns:
        sentiment (str): "Positive", "Negative", or "Neutral"
        scores (dict): The full set of polarity scores.
    """
    sia = SentimentIntensityAnalyzer()
    scores = sia.polarity_scores(text)
    compound = scores["compound"]
    # ±0.05 on the compound score are the conventional VADER cutoffs.
    if compound >= 0.05:
        sentiment = "Positive"
    elif compound <= -0.05:
        sentiment = "Negative"
    else:
        sentiment = "Neutral"
    return sentiment, scores
def extract_topics(text):
    """
    Extracts topics from the input text using basic noun extraction.
    Tokenizes the text, removes stopwords and punctuation, and returns
    a list of unique nouns.
    """
    text = text.lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words("english"))
    filtered = [word for word in tokens if word.isalpha() and word not in stop_words]
    tagged = nltk.pos_tag(filtered)
    nouns = [word for word, pos in tagged if pos in ["NN", "NNS", "NNP", "NNPS"]]
    return list(set(nouns))
def comparative_analysis(articles):
    """
    Performs comparative analysis across articles: sentiment distribution,
    coverage differences (sales vs. regulatory focus), and topic overlap.
    """
    sentiment_distribution = {"Positive": 0, "Negative": 0, "Neutral": 0}
    sales_keywords = {"sales", "growth", "record", "profit"}
    regulatory_keywords = {"regulation", "regulatory", "scrutiny", "lawsuit", "legal", "compliance"}
    sales_count = 0
    reg_count = 0
    all_topics = []
    for article in articles:
        sentiment = article.get("sentiment", "Neutral")
        sentiment_distribution[sentiment] += 1
        combined_text = f"{article['title']} {article['summary']}".lower()
        if any(keyword in combined_text for keyword in sales_keywords):
            sales_count += 1
        if any(keyword in combined_text for keyword in regulatory_keywords):
            reg_count += 1
        topics = extract_topics(combined_text)
        article["topics"] = topics
        all_topics.extend(topics)
    if sales_count > reg_count:
        coverage_insight = (
            f"More articles ({sales_count}) emphasize sales and financial growth "
            f"compared to regulatory concerns ({reg_count})."
        )
    elif reg_count > sales_count:
        coverage_insight = (
            f"More articles ({reg_count}) focus on regulatory or legal challenges "
            f"compared to sales aspects ({sales_count})."
        )
    else:
        coverage_insight = (
            f"An equal number of articles emphasize sales/growth and regulatory "
            f"issues ({sales_count} each)."
        )
    topic_counter = Counter(all_topics)
    # Topics seen in more than one article are "common"; topics seen exactly
    # once are unique to the article they came from.
    common_topics = [topic for topic, count in topic_counter.items() if count > 1]
    unique_topics = {}
    for i, article in enumerate(articles, start=1):
        unique = [topic for topic in article.get("topics", []) if topic_counter[topic] == 1]
        unique_topics[f"Article {i}"] = unique
    analysis = {
        "Sentiment Distribution": sentiment_distribution,
        "Coverage Differences": coverage_insight,
        "Topic Overlap": {
            "Common Topics": common_topics,
            "Unique Topics": unique_topics
        }
    }
    return analysis
def convert_text_to_hindi_tts(text, output_file="output.mp3"):
    """
    Converts the input text into Hindi speech using gTTS and saves it
    as an MP3 file.
    """
    tts = gTTS(text=text, lang='hi', slow=False)
    tts.save(output_file)
    return output_file
def play_audio(file_path):
    """
    Plays an audio file using the system's default media player.
    """
    if platform.system() == "Windows":
        os.startfile(file_path)
    elif platform.system() == "Darwin":
        # Quote the path so filenames with spaces survive the shell.
        os.system(f'open "{file_path}"')
    else:
        # Assumes mpg123 is installed on Linux; swap in another player if needed.
        os.system(f'mpg123 "{file_path}"')