Update utils.py
utils.py
CHANGED
@@ -1,148 +1,150 @@
- (148 deleted lines omitted; the previous version of utils.py is truncated beyond recovery in this view)
+import nltk
+nltk.download('punkt')  # Download the 'punkt' tokenizer models required by word_tokenize
+
+import requests
+from bs4 import BeautifulSoup
+import time
+from nltk.sentiment import SentimentIntensityAnalyzer
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+from collections import Counter
+from gtts import gTTS
+import os
+import platform
+
+# Download required NLTK data files (if not already available).
+nltk.download('vader_lexicon')
+
+nltk.download('averaged_perceptron_tagger')
+nltk.download('stopwords')
+
+def get_bing_news_articles(company_name, num_articles=10):
+    """
+    Scrapes Bing News search results for a given company name.
+    Returns a list of articles with metadata: title, summary, URL, and source.
+    """
+    query = company_name.replace(" ", "+")
+    url = f"https://www.bing.com/news/search?q={query}&FORM=HDRSC6"
+    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
+    response = requests.get(url, headers=headers)
+    if response.status_code != 200:
+        return []
+    soup = BeautifulSoup(response.text, "html.parser")
+    articles = []
+    news_cards = soup.find_all("div", class_="news-card")
+    for card in news_cards:
+        title_tag = card.find("a", class_="title")
+        if not title_tag:
+            continue
+        title = title_tag.get_text(strip=True)
+        article_url = title_tag.get("href")
+        snippet_tag = card.find("div", class_="snippet")
+        snippet = snippet_tag.get_text(strip=True) if snippet_tag else ""
+        source_tag = card.find("div", class_="source")
+        source = source_tag.get_text(strip=True) if source_tag else ""
+        articles.append({
+            "title": title,
+            "summary": snippet,
+            "url": article_url,
+            "source": source
+        })
+        if len(articles) >= num_articles:
+            break
+    return articles
+
+def analyze_sentiment(text):
+    """
+    Analyzes the sentiment of the given text using NLTK's VADER.
+    Returns:
+        sentiment (str): "Positive", "Negative", or "Neutral"
+        scores (dict): The full set of polarity scores.
+    """
+    sia = SentimentIntensityAnalyzer()
+    scores = sia.polarity_scores(text)
+    compound = scores["compound"]
+    if compound >= 0.05:
+        sentiment = "Positive"
+    elif compound <= -0.05:
+        sentiment = "Negative"
+    else:
+        sentiment = "Neutral"
+    return sentiment, scores
+
+def extract_topics(text):
+    """
+    Extracts topics from the input text using basic noun extraction.
+    Tokenizes the text, removes stopwords and punctuation, and returns a list of unique nouns.
+    """
+    text = text.lower()
+    tokens = word_tokenize(text)
+    stop_words = set(stopwords.words("english"))
+    filtered = [word for word in tokens if word.isalpha() and word not in stop_words]
+    tagged = nltk.pos_tag(filtered)
+    nouns = [word for word, pos in tagged if pos in ["NN", "NNS", "NNP", "NNPS"]]
+    return list(set(nouns))
+
+def comparative_analysis(articles):
+    """
+    Performs comparative analysis across articles.
+    Returns a dictionary with:
+        - Sentiment Distribution: Count of articles per sentiment.
+        - Coverage Differences: Insights based on keyword presence.
+        - Topic Overlap: Common topics and unique topics per article.
+    """
+    sentiment_distribution = {"Positive": 0, "Negative": 0, "Neutral": 0}
+    sales_keywords = {"sales", "growth", "record", "profit"}
+    regulatory_keywords = {"regulation", "regulatory", "scrutiny", "lawsuit", "legal", "compliance"}
+    sales_count = 0
+    reg_count = 0
+    all_topics = []
+    for article in articles:
+        sentiment = article.get("sentiment", "Neutral")
+        sentiment_distribution[sentiment] += 1
+        combined_text = f"{article['title']} {article['summary']}".lower()
+        if any(keyword in combined_text for keyword in sales_keywords):
+            sales_count += 1
+        if any(keyword in combined_text for keyword in regulatory_keywords):
+            reg_count += 1
+        topics = extract_topics(combined_text)
+        article["topics"] = topics
+        all_topics.extend(topics)
+    if sales_count > reg_count:
+        coverage_insight = (f"More articles ({sales_count}) emphasize sales and financial growth compared to regulatory concerns ({reg_count}).")
+    elif reg_count > sales_count:
+        coverage_insight = (f"More articles ({reg_count}) focus on regulatory or legal challenges compared to sales aspects ({sales_count}).")
+    else:
+        coverage_insight = (f"An equal number of articles emphasize sales/growth and regulatory issues ({sales_count} each).")
+    topic_counter = Counter(all_topics)
+    common_topics = [topic for topic, count in topic_counter.items() if count > 1]
+    unique_topics = {}
+    for i, article in enumerate(articles, start=1):
+        unique = [topic for topic in article.get("topics", []) if topic_counter[topic] == 1]
+        unique_topics[f"Article {i}"] = unique
+    analysis = {
+        "Sentiment Distribution": sentiment_distribution,
+        "Coverage Differences": coverage_insight,
+        "Topic Overlap": {
+            "Common Topics": common_topics,
+            "Unique Topics": unique_topics
+        }
+    }
+    return analysis
+
+def convert_text_to_hindi_tts(text, output_file="output.mp3"):
+    """
+    Converts the input text into Hindi speech using gTTS and saves it as an MP3 file.
+    """
+    tts = gTTS(text=text, lang='hi', slow=False)
+    tts.save(output_file)
+    return output_file
+
+def play_audio(file_path):
+    """
+    Plays an audio file using the system's default media player.
+    """
+    if platform.system() == "Windows":
+        os.startfile(file_path)
+    elif platform.system() == "Darwin":
+        os.system(f"open {file_path}")
+    else:
+        os.system(f"mpg123 {file_path}")
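
A minimal driver sketch showing how these helpers compose, assuming the file is importable as utils. The company name, article count, Hindi sentence, and output filename are illustrative; the scraper needs network access, and changes to Bing's markup can make it return an empty list. Note that comparative_analysis reads article["sentiment"] but never sets it, so the caller must tag each article first.

import utils

articles = utils.get_bing_news_articles("Tesla", num_articles=5)
for article in articles:
    # Tag each article with the VADER-based label before the comparative step.
    sentiment, scores = utils.analyze_sentiment(f"{article['title']} {article['summary']}")
    article["sentiment"] = sentiment

report = utils.comparative_analysis(articles)
print(report["Sentiment Distribution"])
print(report["Coverage Differences"])

# Synthesize an (illustrative) Hindi summary and play it with the OS default player.
audio_file = utils.convert_text_to_hindi_tts("समाचार कवरेज का सारांश तैयार है।", output_file="summary_hi.mp3")
utils.play_audio(audio_file)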