import nltk
import requests
from bs4 import BeautifulSoup
import time
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from gtts import gTTS
import os
import platform
# Download required NLTK data files.
nltk.download('punkt')  # tokenizer models used by word_tokenize
nltk.download('vader_lexicon')  # lexicon for the VADER sentiment analyzer
nltk.download('averaged_perceptron_tagger')  # POS tagger used for noun extraction
nltk.download('stopwords')  # English stopword list
def get_bing_news_articles(company_name, num_articles=10):
"""
Scrapes Bing News search results for a given company name.
"""
query = company_name.replace(" ", "+")
url = f"https://www.bing.com/news/search?q={query}&FORM=HDRSC6"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    response = requests.get(url, headers=headers, timeout=10)
if response.status_code != 200:
return []
soup = BeautifulSoup(response.text, "html.parser")
articles = []
news_cards = soup.find_all("div", class_="news-card")
for card in news_cards:
title_tag = card.find("a", class_="title")
if not title_tag:
continue
title = title_tag.get_text(strip=True)
article_url = title_tag.get("href")
snippet_tag = card.find("div", class_="snippet")
snippet = snippet_tag.get_text(strip=True) if snippet_tag else ""
source_tag = card.find("div", class_="source")
source = source_tag.get_text(strip=True) if source_tag else ""
articles.append({
"title": title,
"summary": snippet,
"url": article_url,
"source": source
})
if len(articles) >= num_articles:
break
return articles
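# Illustrative call (network-dependent; results vary with Bing's live index):
#   articles = get_bing_news_articles("Tesla", num_articles=5)
#   -> [{"title": "...", "summary": "...", "url": "...", "source": "..."}, ...]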
def analyze_sentiment(text):
"""
Analyzes the sentiment of the given text using NLTK's VADER.
Returns:
sentiment (str): "Positive", "Negative", or "Neutral"
scores (dict): The full set of polarity scores.
"""
sia = SentimentIntensityAnalyzer()
scores = sia.polarity_scores(text)
compound = scores["compound"]
if compound >= 0.05:
sentiment = "Positive"
elif compound <= -0.05:
sentiment = "Negative"
else:
sentiment = "Neutral"
return sentiment, scores
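# Illustrative call (VADER's compound score drives the label; exact values vary):
#   analyze_sentiment("Record profits lift shares")
#   -> ("Positive", {"neg": ..., "neu": ..., "pos": ..., "compound": ...})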
def extract_topics(text):
"""
Extracts topics from the input text using basic noun extraction.
Tokenizes the text, removes stopwords and punctuation, and returns a list of unique nouns.
"""
text = text.lower()
tokens = word_tokenize(text)
stop_words = set(stopwords.words("english"))
filtered = [word for word in tokens if word.isalpha() and word not in stop_words]
tagged = nltk.pos_tag(filtered)
nouns = [word for word, pos in tagged if pos in ["NN", "NNS", "NNP", "NNPS"]]
return list(set(nouns))
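# Illustrative call (actual output depends on the POS tagger; order is not
# guaranteed because a set is used for de-duplication):
#   extract_topics("Tesla faces regulatory scrutiny over sales practices")
#   -> a list like ["tesla", "scrutiny", "sales", "practices"]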
def comparative_analysis(articles):
"""
Performs comparative analysis across articles.
"""
sentiment_distribution = {"Positive": 0, "Negative": 0, "Neutral": 0}
sales_keywords = {"sales", "growth", "record", "profit"}
regulatory_keywords = {"regulation", "regulatory", "scrutiny", "lawsuit", "legal", "compliance"}
sales_count = 0
reg_count = 0
all_topics = []
for article in articles:
sentiment = article.get("sentiment", "Neutral")
sentiment_distribution[sentiment] += 1
combined_text = f"{article['title']} {article['summary']}".lower()
if any(keyword in combined_text for keyword in sales_keywords):
sales_count += 1
if any(keyword in combined_text for keyword in regulatory_keywords):
reg_count += 1
topics = extract_topics(combined_text)
article["topics"] = topics
all_topics.extend(topics)
if sales_count > reg_count:
coverage_insight = (f"More articles ({sales_count}) emphasize sales and financial growth compared to regulatory concerns ({reg_count}).")
elif reg_count > sales_count:
coverage_insight = (f"More articles ({reg_count}) focus on regulatory or legal challenges compared to sales aspects ({sales_count}).")
else:
coverage_insight = (f"An equal number of articles emphasize sales/growth and regulatory issues ({sales_count} each).")
topic_counter = Counter(all_topics)
common_topics = [topic for topic, count in topic_counter.items() if count > 1]
unique_topics = {}
for i, article in enumerate(articles, start=1):
unique = [topic for topic in article.get("topics", []) if topic_counter[topic] == 1]
unique_topics[f"Article {i}"] = unique
analysis = {
"Sentiment Distribution": sentiment_distribution,
"Coverage Differences": coverage_insight,
"Topic Overlap": {
"Common Topics": common_topics,
"Unique Topics": unique_topics
}
}
return analysis
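# Illustrative shape of the returned analysis (values depend on the inputs):
#   {"Sentiment Distribution": {"Positive": 3, "Negative": 1, "Neutral": 1},
#    "Coverage Differences": "More articles (3) emphasize sales ...",
#    "Topic Overlap": {"Common Topics": [...], "Unique Topics": {"Article 1": [...], ...}}}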
def convert_text_to_hindi_tts(text, output_file="output.mp3"):
"""
Converts the input text into Hindi speech using gTTS and saves it as an MP3 file.
"""
tts = gTTS(text=text, lang='hi', slow=False)
tts.save(output_file)
return output_file
def play_audio(file_path):
"""
Plays an audio file using the system's default media player.
"""
    if platform.system() == "Windows":
        os.startfile(file_path)
    elif platform.system() == "Darwin":
        os.system(f'open "{file_path}"')
    else:
        # Assumes the mpg123 CLI player is installed; substitute any available player.
        os.system(f'mpg123 "{file_path}"')