murtaza2801 committed
Commit c72f236 · verified · 1 Parent(s): 72f811b

Update utils.py

Files changed (1)
  1. utils.py +150 -148
utils.py CHANGED
@@ -1,148 +1,150 @@
- import requests
- from bs4 import BeautifulSoup
- import time
- import nltk
- from nltk.sentiment import SentimentIntensityAnalyzer
- from nltk.tokenize import word_tokenize
- from nltk.corpus import stopwords
- from collections import Counter
- from gtts import gTTS
- import os
- import platform
-
- # Download required NLTK data files (if not already available).
- nltk.download('vader_lexicon')
- nltk.download('punkt')
- nltk.download('averaged_perceptron_tagger')
- nltk.download('stopwords')
-
- def get_bing_news_articles(company_name, num_articles=10):
-     """
-     Scrapes Bing News search results for a given company name.
-     Returns a list of articles with metadata: title, summary, URL, and source.
-     """
-     query = company_name.replace(" ", "+")
-     url = f"https://www.bing.com/news/search?q={query}&FORM=HDRSC6"
-     headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
-     response = requests.get(url, headers=headers)
-     if response.status_code != 200:
-         return []
-     soup = BeautifulSoup(response.text, "html.parser")
-     articles = []
-     news_cards = soup.find_all("div", class_="news-card")
-     for card in news_cards:
-         title_tag = card.find("a", class_="title")
-         if not title_tag:
-             continue
-         title = title_tag.get_text(strip=True)
-         article_url = title_tag.get("href")
-         snippet_tag = card.find("div", class_="snippet")
-         snippet = snippet_tag.get_text(strip=True) if snippet_tag else ""
-         source_tag = card.find("div", class_="source")
-         source = source_tag.get_text(strip=True) if source_tag else ""
-         articles.append({
-             "title": title,
-             "summary": snippet,
-             "url": article_url,
-             "source": source
-         })
-         if len(articles) >= num_articles:
-             break
-     return articles
-
- def analyze_sentiment(text):
-     """
-     Analyzes the sentiment of the given text using NLTK's VADER.
-     Returns:
-         sentiment (str): "Positive", "Negative", or "Neutral"
-         scores (dict): The full set of polarity scores.
-     """
-     sia = SentimentIntensityAnalyzer()
-     scores = sia.polarity_scores(text)
-     compound = scores["compound"]
-     if compound >= 0.05:
-         sentiment = "Positive"
-     elif compound <= -0.05:
-         sentiment = "Negative"
-     else:
-         sentiment = "Neutral"
-     return sentiment, scores
-
- def extract_topics(text):
-     """
-     Extracts topics from the input text using basic noun extraction.
-     Tokenizes the text, removes stopwords and punctuation, and returns a list of unique nouns.
-     """
-     text = text.lower()
-     tokens = word_tokenize(text)
-     stop_words = set(stopwords.words("english"))
-     filtered = [word for word in tokens if word.isalpha() and word not in stop_words]
-     tagged = nltk.pos_tag(filtered)
-     nouns = [word for word, pos in tagged if pos in ["NN", "NNS", "NNP", "NNPS"]]
-     return list(set(nouns))
-
- def comparative_analysis(articles):
-     """
-     Performs comparative analysis across articles.
-     Returns a dictionary with:
-         - Sentiment Distribution: Count of articles per sentiment.
-         - Coverage Differences: Insights based on keyword presence.
-         - Topic Overlap: Common topics and unique topics per article.
-     """
-     sentiment_distribution = {"Positive": 0, "Negative": 0, "Neutral": 0}
-     sales_keywords = {"sales", "growth", "record", "profit"}
-     regulatory_keywords = {"regulation", "regulatory", "scrutiny", "lawsuit", "legal", "compliance"}
-     sales_count = 0
-     reg_count = 0
-     all_topics = []
-     for article in articles:
-         sentiment = article.get("sentiment", "Neutral")
-         sentiment_distribution[sentiment] += 1
-         combined_text = f"{article['title']} {article['summary']}".lower()
-         if any(keyword in combined_text for keyword in sales_keywords):
-             sales_count += 1
-         if any(keyword in combined_text for keyword in regulatory_keywords):
-             reg_count += 1
-         topics = extract_topics(combined_text)
-         article["topics"] = topics
-         all_topics.extend(topics)
-     if sales_count > reg_count:
-         coverage_insight = (f"More articles ({sales_count}) emphasize sales and financial growth compared to regulatory concerns ({reg_count}).")
-     elif reg_count > sales_count:
-         coverage_insight = (f"More articles ({reg_count}) focus on regulatory or legal challenges compared to sales aspects ({sales_count}).")
-     else:
-         coverage_insight = (f"An equal number of articles emphasize sales/growth and regulatory issues ({sales_count} each).")
-     topic_counter = Counter(all_topics)
-     common_topics = [topic for topic, count in topic_counter.items() if count > 1]
-     unique_topics = {}
-     for i, article in enumerate(articles, start=1):
-         unique = [topic for topic in article.get("topics", []) if topic_counter[topic] == 1]
-         unique_topics[f"Article {i}"] = unique
-     analysis = {
-         "Sentiment Distribution": sentiment_distribution,
-         "Coverage Differences": coverage_insight,
-         "Topic Overlap": {
-             "Common Topics": common_topics,
-             "Unique Topics": unique_topics
-         }
-     }
-     return analysis
-
- def convert_text_to_hindi_tts(text, output_file="output.mp3"):
-     """
-     Converts the input text into Hindi speech using gTTS and saves it as an MP3 file.
-     """
-     tts = gTTS(text=text, lang='hi', slow=False)
-     tts.save(output_file)
-     return output_file
-
- def play_audio(file_path):
-     """
-     Plays an audio file using the system's default media player.
-     """
-     if platform.system() == "Windows":
-         os.startfile(file_path)
-     elif platform.system() == "Darwin":
-         os.system(f"open {file_path}")
-     else:
-         os.system(f"mpg123 {file_path}")
 
 
 
+ import nltk
+ nltk.download('punkt') # Download the required resource
+
+ import requests
+ from bs4 import BeautifulSoup
+ import time
+ from nltk.sentiment import SentimentIntensityAnalyzer
+ from nltk.tokenize import word_tokenize
+ from nltk.corpus import stopwords
+ from collections import Counter
+ from gtts import gTTS
+ import os
+ import platform
+
+ # Download required NLTK data files (if not already available).
+ nltk.download('vader_lexicon')
+
+ nltk.download('averaged_perceptron_tagger')
+ nltk.download('stopwords')
+
+ def get_bing_news_articles(company_name, num_articles=10):
+     """
+     Scrapes Bing News search results for a given company name.
+     Returns a list of articles with metadata: title, summary, URL, and source.
+     """
+     query = company_name.replace(" ", "+")
+     url = f"https://www.bing.com/news/search?q={query}&FORM=HDRSC6"
+     headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
+     response = requests.get(url, headers=headers)
+     if response.status_code != 200:
+         return []
+     soup = BeautifulSoup(response.text, "html.parser")
+     articles = []
+     news_cards = soup.find_all("div", class_="news-card")
+     for card in news_cards:
+         title_tag = card.find("a", class_="title")
+         if not title_tag:
+             continue
+         title = title_tag.get_text(strip=True)
+         article_url = title_tag.get("href")
+         snippet_tag = card.find("div", class_="snippet")
+         snippet = snippet_tag.get_text(strip=True) if snippet_tag else ""
+         source_tag = card.find("div", class_="source")
+         source = source_tag.get_text(strip=True) if source_tag else ""
+         articles.append({
+             "title": title,
+             "summary": snippet,
+             "url": article_url,
+             "source": source
+         })
+         if len(articles) >= num_articles:
+             break
+     return articles
+
+ def analyze_sentiment(text):
+     """
+     Analyzes the sentiment of the given text using NLTK's VADER.
+     Returns:
+         sentiment (str): "Positive", "Negative", or "Neutral"
+         scores (dict): The full set of polarity scores.
+     """
+     sia = SentimentIntensityAnalyzer()
+     scores = sia.polarity_scores(text)
+     compound = scores["compound"]
+     if compound >= 0.05:
+         sentiment = "Positive"
+     elif compound <= -0.05:
+         sentiment = "Negative"
+     else:
+         sentiment = "Neutral"
+     return sentiment, scores
+
+ def extract_topics(text):
+     """
+     Extracts topics from the input text using basic noun extraction.
+     Tokenizes the text, removes stopwords and punctuation, and returns a list of unique nouns.
+     """
+     text = text.lower()
+     tokens = word_tokenize(text)
+     stop_words = set(stopwords.words("english"))
+     filtered = [word for word in tokens if word.isalpha() and word not in stop_words]
+     tagged = nltk.pos_tag(filtered)
+     nouns = [word for word, pos in tagged if pos in ["NN", "NNS", "NNP", "NNPS"]]
+     return list(set(nouns))
+
+ def comparative_analysis(articles):
+     """
+     Performs comparative analysis across articles.
+     Returns a dictionary with:
+         - Sentiment Distribution: Count of articles per sentiment.
+         - Coverage Differences: Insights based on keyword presence.
+         - Topic Overlap: Common topics and unique topics per article.
+     """
+     sentiment_distribution = {"Positive": 0, "Negative": 0, "Neutral": 0}
+     sales_keywords = {"sales", "growth", "record", "profit"}
+     regulatory_keywords = {"regulation", "regulatory", "scrutiny", "lawsuit", "legal", "compliance"}
+     sales_count = 0
+     reg_count = 0
+     all_topics = []
+     for article in articles:
+         sentiment = article.get("sentiment", "Neutral")
+         sentiment_distribution[sentiment] += 1
+         combined_text = f"{article['title']} {article['summary']}".lower()
+         if any(keyword in combined_text for keyword in sales_keywords):
+             sales_count += 1
+         if any(keyword in combined_text for keyword in regulatory_keywords):
+             reg_count += 1
+         topics = extract_topics(combined_text)
+         article["topics"] = topics
+         all_topics.extend(topics)
+     if sales_count > reg_count:
+         coverage_insight = (f"More articles ({sales_count}) emphasize sales and financial growth compared to regulatory concerns ({reg_count}).")
+     elif reg_count > sales_count:
+         coverage_insight = (f"More articles ({reg_count}) focus on regulatory or legal challenges compared to sales aspects ({sales_count}).")
+     else:
+         coverage_insight = (f"An equal number of articles emphasize sales/growth and regulatory issues ({sales_count} each).")
+     topic_counter = Counter(all_topics)
+     common_topics = [topic for topic, count in topic_counter.items() if count > 1]
+     unique_topics = {}
+     for i, article in enumerate(articles, start=1):
+         unique = [topic for topic in article.get("topics", []) if topic_counter[topic] == 1]
+         unique_topics[f"Article {i}"] = unique
+     analysis = {
+         "Sentiment Distribution": sentiment_distribution,
+         "Coverage Differences": coverage_insight,
+         "Topic Overlap": {
+             "Common Topics": common_topics,
+             "Unique Topics": unique_topics
+         }
+     }
+     return analysis
+
+ def convert_text_to_hindi_tts(text, output_file="output.mp3"):
+     """
+     Converts the input text into Hindi speech using gTTS and saves it as an MP3 file.
+     """
+     tts = gTTS(text=text, lang='hi', slow=False)
+     tts.save(output_file)
+     return output_file
+
+ def play_audio(file_path):
+     """
+     Plays an audio file using the system's default media player.
+     """
+     if platform.system() == "Windows":
+         os.startfile(file_path)
+     elif platform.system() == "Darwin":
+         os.system(f"open {file_path}")
+     else:
+         os.system(f"mpg123 {file_path}")
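
The functional change in this commit is small: import nltk and nltk.download('punkt') now run at the very top of the module, before the other imports. All four download calls still execute unconditionally on every import of utils.py. A possible refinement, shown as a sketch (the ensure_nltk_data helper is hypothetical and not part of this commit; zipped-resource lookup paths can vary across NLTK versions), is to hit the network only when a local lookup fails:

    import nltk

    def ensure_nltk_data():
        # Hypothetical helper: check each resource locally, download only on a miss,
        # so repeated imports of utils.py do not re-download on every run.
        resources = [
            ("vader_lexicon", "sentiment/vader_lexicon"),
            ("punkt", "tokenizers/punkt"),
            ("averaged_perceptron_tagger", "taggers/averaged_perceptron_tagger"),
            ("stopwords", "corpora/stopwords"),
        ]
        for name, path in resources:
            try:
                nltk.data.find(path)  # raises LookupError when the resource is absent
            except LookupError:
                nltk.download(name, quiet=True)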
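A note on how these helpers fit together: comparative_analysis reads each article's "sentiment" key (defaulting to "Neutral"), so the caller is expected to attach that key after running analyze_sentiment. A minimal usage sketch, assuming the module is used as it ships here (the company name is only a placeholder):

    from utils import get_bing_news_articles, analyze_sentiment, comparative_analysis

    articles = get_bing_news_articles("Tesla", num_articles=5)
    for article in articles:
        # Attach the label that comparative_analysis expects on each article dict.
        sentiment, scores = analyze_sentiment(f"{article['title']} {article['summary']}")
        article["sentiment"] = sentiment

    report = comparative_analysis(articles)
    print(report["Sentiment Distribution"])
    print(report["Coverage Differences"])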
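On the scraping side, get_bing_news_articles returns [] for any non-200 status, but the request itself has no timeout, and the CSS class names it matches (news-card, title, snippet, source) depend on Bing's current markup rather than any stable API. A small hardening sketch for the fetch step (fetch_news_html is a hypothetical helper, and the timeout value is arbitrary):

    import requests

    def fetch_news_html(url, headers, timeout=10):
        # Bound the request and treat HTTP errors like any other failed fetch.
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException:
            return None
        return response.text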
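Finally, play_audio interpolates the raw path into a shell command, so a filename containing spaces or shell metacharacters breaks the open and mpg123 branches. A safer variant (play_audio_safe is hypothetical, not part of this commit; the Linux branch still assumes mpg123 is installed) passes argument lists to subprocess.run, which bypasses the shell entirely:

    import os
    import platform
    import subprocess

    def play_audio_safe(file_path):
        # Argument lists avoid shell quoting issues with unusual file names.
        system = platform.system()
        if system == "Windows":
            os.startfile(file_path)  # Windows-only API
        elif system == "Darwin":
            subprocess.run(["open", file_path], check=False)
        else:
            subprocess.run(["mpg123", file_path], check=False)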