Spaces:
Configuration error
Configuration error
Update utils.py
Browse files
utils.py
CHANGED
|
@@ -1,148 +1,150 @@
|
|
| 1 |
-
import
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
import
|
| 5 |
-
from
|
| 6 |
-
|
| 7 |
-
from nltk.
|
| 8 |
-
from
|
| 9 |
-
from
|
| 10 |
-
import
|
| 11 |
-
import
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
nltk.download('
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
"
|
| 47 |
-
"
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
sentiment = "
|
| 67 |
-
|
| 68 |
-
sentiment = "
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
text
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
if any(keyword in combined_text for keyword in
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
coverage_insight = (f"More articles ({
|
| 113 |
-
|
| 114 |
-
coverage_insight = (f"
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
for
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
"
|
| 123 |
-
|
| 124 |
-
"
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
"""
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
"""
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
os.
|
| 147 |
-
|
| 148 |
-
os.system(f"
|
|
|
|
|
|
|
|
|
| 1 |
+
import nltk
|
| 2 |
+
nltk.download('punkt') # Download the required resource
|
| 3 |
+
|
| 4 |
+
import requests
|
| 5 |
+
from bs4 import BeautifulSoup
|
| 6 |
+
import time
|
| 7 |
+
from nltk.sentiment import SentimentIntensityAnalyzer
|
| 8 |
+
from nltk.tokenize import word_tokenize
|
| 9 |
+
from nltk.corpus import stopwords
|
| 10 |
+
from collections import Counter
|
| 11 |
+
from gtts import gTTS
|
| 12 |
+
import os
|
| 13 |
+
import platform
|
| 14 |
+
|
| 15 |
+
# Download required NLTK data files (if not already available).
|
| 16 |
+
nltk.download('vader_lexicon')
|
| 17 |
+
|
| 18 |
+
nltk.download('averaged_perceptron_tagger')
|
| 19 |
+
nltk.download('stopwords')
|
| 20 |
+
|
| 21 |
+
def get_bing_news_articles(company_name, num_articles=10):
    """Scrape Bing News search results for a given company name.

    Args:
        company_name: Company to search for; spaces are encoded as '+'.
        num_articles: Maximum number of articles to return (default 10).

    Returns:
        A list of dicts with keys "title", "summary", "url", and "source".
        Returns an empty list on a network error or a non-200 response.
    """
    query = company_name.replace(" ", "+")
    url = f"https://www.bing.com/news/search?q={query}&FORM=HDRSC6"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    try:
        # timeout prevents the request from hanging indefinitely on a
        # stalled server; without it this call can block forever.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Treat connection/DNS/timeout failures the same as a bad status:
        # callers already handle an empty result list.
        return []
    if response.status_code != 200:
        return []
    soup = BeautifulSoup(response.text, "html.parser")
    articles = []
    for card in soup.find_all("div", class_="news-card"):
        title_tag = card.find("a", class_="title")
        if not title_tag:
            # Skip cards without a title link (ads / placeholders).
            continue
        snippet_tag = card.find("div", class_="snippet")
        source_tag = card.find("div", class_="source")
        articles.append({
            "title": title_tag.get_text(strip=True),
            "summary": snippet_tag.get_text(strip=True) if snippet_tag else "",
            "url": title_tag.get("href"),
            "source": source_tag.get_text(strip=True) if source_tag else "",
        })
        if len(articles) >= num_articles:
            break
    return articles
|
| 54 |
+
|
| 55 |
+
def analyze_sentiment(text):
    """Classify the sentiment of *text* with NLTK's VADER analyzer.

    Returns:
        A ``(sentiment, scores)`` tuple: *sentiment* is one of
        "Positive", "Negative", or "Neutral"; *scores* is the full
        polarity-score dict produced by VADER.
    """
    analyzer = SentimentIntensityAnalyzer()
    scores = analyzer.polarity_scores(text)
    compound = scores["compound"]
    # Standard VADER cutoffs: |compound| < 0.05 is considered neutral.
    if compound >= 0.05:
        label = "Positive"
    elif compound <= -0.05:
        label = "Negative"
    else:
        label = "Neutral"
    return label, scores
|
| 72 |
+
|
| 73 |
+
def extract_topics(text):
    """Extract candidate topics from *text* via simple noun extraction.

    Lowercases and tokenizes the text, drops stopwords and
    non-alphabetic tokens, POS-tags what remains, and returns the
    unique nouns as a list.
    """
    stop_words = set(stopwords.words("english"))
    tokens = word_tokenize(text.lower())
    candidates = [tok for tok in tokens if tok.isalpha() and tok not in stop_words]
    # Keep singular/plural common and proper nouns only.
    noun_tags = {"NN", "NNS", "NNP", "NNPS"}
    nouns = {word for word, tag in nltk.pos_tag(candidates) if tag in noun_tags}
    return list(nouns)
|
| 85 |
+
|
| 86 |
+
def comparative_analysis(articles):
    """Perform a comparative analysis across a list of article dicts.

    Each article is annotated in place with a "topics" list. Returns a
    dict with:
        - "Sentiment Distribution": article counts per sentiment label.
        - "Coverage Differences": a sentence contrasting sales-focused
          vs. regulatory-focused coverage based on keyword hits.
        - "Topic Overlap": topics shared by multiple articles, plus the
          topics unique to each article.
    """
    sentiment_distribution = {"Positive": 0, "Negative": 0, "Neutral": 0}
    sales_keywords = {"sales", "growth", "record", "profit"}
    regulatory_keywords = {"regulation", "regulatory", "scrutiny", "lawsuit", "legal", "compliance"}
    sales_count = 0
    reg_count = 0
    all_topics = []

    for article in articles:
        # Missing sentiment defaults to "Neutral" so the counter key exists.
        label = article.get("sentiment", "Neutral")
        sentiment_distribution[label] += 1
        combined_text = f"{article['title']} {article['summary']}".lower()
        # bool -> int coercion: adds 1 when any keyword matches.
        sales_count += any(kw in combined_text for kw in sales_keywords)
        reg_count += any(kw in combined_text for kw in regulatory_keywords)
        topics = extract_topics(combined_text)
        article["topics"] = topics  # annotate in place for later reporting
        all_topics.extend(topics)

    if sales_count > reg_count:
        coverage_insight = (
            f"More articles ({sales_count}) emphasize sales and financial growth "
            f"compared to regulatory concerns ({reg_count})."
        )
    elif reg_count > sales_count:
        coverage_insight = (
            f"More articles ({reg_count}) focus on regulatory or legal challenges "
            f"compared to sales aspects ({sales_count})."
        )
    else:
        coverage_insight = (
            f"An equal number of articles emphasize sales/growth and regulatory "
            f"issues ({sales_count} each)."
        )

    topic_counter = Counter(all_topics)
    common_topics = [topic for topic, freq in topic_counter.items() if freq > 1]
    unique_topics = {
        f"Article {idx}": [t for t in article.get("topics", []) if topic_counter[t] == 1]
        for idx, article in enumerate(articles, start=1)
    }
    return {
        "Sentiment Distribution": sentiment_distribution,
        "Coverage Differences": coverage_insight,
        "Topic Overlap": {
            "Common Topics": common_topics,
            "Unique Topics": unique_topics,
        },
    }
|
| 132 |
+
|
| 133 |
+
def convert_text_to_hindi_tts(text, output_file="output.mp3"):
    """Convert *text* into Hindi speech via gTTS and save it as an MP3.

    Returns the path of the saved audio file (``output_file``).
    """
    speech = gTTS(text=text, lang='hi', slow=False)
    speech.save(output_file)
    return output_file
|
| 140 |
+
|
| 141 |
+
def play_audio(file_path):
    """Play an audio file with the platform's default mechanism.

    Uses ``os.startfile`` on Windows, the ``open`` command on macOS, and
    ``mpg123`` elsewhere (assumed installed on Linux — TODO confirm).
    """
    import subprocess  # local import: only needed on non-Windows platforms

    system = platform.system()
    if system == "Windows":
        os.startfile(file_path)
    elif system == "Darwin":
        # Argument list with shell=False (the subprocess.run default)
        # avoids shell injection and handles paths containing spaces,
        # unlike the previous os.system(f"open {file_path}").
        subprocess.run(["open", file_path], check=False)
    else:
        subprocess.run(["mpg123", file_path], check=False)
|