Spaces:
Sleeping
Sleeping
Upload 2 files
Browse files
app.py
ADDED
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import nltk
# Fetch NLTK data needed at runtime: 'punkt_tab' backs tokenization and
# 'averaged_perceptron_tagger_eng' backs POS tagging — both are used
# indirectly via the helpers imported from utils below.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
import streamlit as st
from streamlit_lottie import st_lottie
import requests
import time
from utils import (
    get_bing_news_articles,
    analyze_sentiment,
    extract_topics,
    comparative_analysis,
    convert_text_to_hindi_tts,
)
# NOTE(review): Counter appears unused in this module — confirm before removing.
from collections import Counter
|
16 |
+
# Load Lottie Animation
def load_lottie_url(url, timeout=10):
    """Fetch a Lottie animation JSON from *url*.

    Parameters
    ----------
    url : str
        Address of the Lottie JSON resource.
    timeout : float, optional
        Seconds to wait for the HTTP response (default 10). BUG FIX: the
        original call had no timeout, so a dead host could hang the app
        at startup indefinitely.

    Returns
    -------
    dict | None
        Parsed JSON payload, or None on any network, HTTP, or decode error.
    """
    try:
        r = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Network failure: treat like a bad status so the caller can
        # render the page without the animation.
        return None
    if r.status_code != 200:
        return None
    try:
        return r.json()
    except ValueError:
        # Response body was not valid JSON.
        return None
|
22 |
+
|
23 |
+
lottie_animation = load_lottie_url("https://lottie.host/d02e4bd8-cd9c-401e-b143-17fc0ad924a8/o2dLZzU9oO.json")

# UI Layout
# BUG FIX: st_lottie raises on a None payload, so only render the
# animation when the download actually succeeded.
if lottie_animation is not None:
    st_lottie(lottie_animation, height=200)
st.markdown("<h1 style='text-align: center; color: #4CAF50;'>Sentiment Analysis Dashboard</h1>", unsafe_allow_html=True)


st.title("News Summarization & Sentiment Analysis with Hindi TTS")
st.write("Enter a company name to fetch news articles, analyze sentiment, and generate a Hindi summary.")

company = st.text_input("Company Name", "Enter Any Company Name")

if st.button("Generate Report"):
    with st.spinner("Fetching news articles..."):
        articles = get_bing_news_articles(company, num_articles=10)

    if not articles:
        st.error("No articles found or there was an error fetching the articles.")
    else:
        # Process each article: perform sentiment analysis.
        for article in articles:
            combined_text = article["title"]
            if article["summary"]:
                combined_text += ". " + article["summary"]
            sentiment, scores = analyze_sentiment(combined_text)
            article["sentiment"] = sentiment
            article["sentiment_scores"] = scores
            # Topics are still extracted but not used in the final summary.
            article["topics"] = extract_topics(combined_text)
            # Small pause keeps the spinner readable; not required for
            # correctness.
            time.sleep(0.5)

        # Display individual article details.
        st.subheader("Extracted Articles")
        for idx, article in enumerate(articles, start=1):
            st.markdown(f"**Article {idx}:**")
            st.write("Title:", article["title"])
            st.write("Summary:", article["summary"])
            st.write("Source:", article["source"])
            st.write("URL:", article["url"])
            st.write("Sentiment:", article["sentiment"])
            st.markdown("---")

        # Perform comparative analysis for internal metrics (sentiment distribution, coverage insights)
        analysis = comparative_analysis(articles)
        st.subheader("Comparative Analysis")
        st.write("**Sentiment Distribution:**", analysis["Sentiment Distribution"])
        st.write("**Coverage Differences:**", analysis["Coverage Differences"])

        # Create a final Hindi summary report that aggregates all the articles.
        total_articles = len(articles)
        dist = analysis["Sentiment Distribution"]
        final_summary = (
            f"कुल {total_articles} लेखों में से, {dist.get('Positive', 0)} लेख सकारात्मक, "
            f"{dist.get('Negative', 0)} लेख नकारात्मक, और {dist.get('Neutral', 0)} लेख तटस्थ हैं।\n"
            "कई लेखों में विक्रय में वृद्धि और आर्थिक विकास पर जोर दिया गया है, जबकि कुछ लेखों में नियामकीय चुनौतियाँ और कानूनी मुद्दों पर चर्चा की गई है।\n"
            "संपूर्ण रूप से, यह रिपोर्ट दर्शाती है कि कंपनी का समाचार कवरेज मुख्य रूप से सकारात्मक है, "
            "जो संभावित आर्थिक विकास के संकेत देता है।"
        )

        st.subheader("Final Summary Report")
        st.markdown(final_summary)

        # Convert the final summary into Hindi speech.
        # BUG FIX: the output filename was hard-coded to
        # "tesla_summary_hi.mp3" regardless of which company was searched;
        # derive a filesystem-safe name from the user's input instead.
        safe_name = "".join(c if c.isalnum() else "_" for c in company.strip()) or "company"
        with st.spinner("Generating Hindi TTS audio..."):
            audio_file = convert_text_to_hindi_tts(final_summary, output_file=f"{safe_name}_summary_hi.mp3")

        st.success("Audio summary generated!")
        st.audio(audio_file)
|
utils.py
ADDED
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import nltk
nltk.download('punkt')  # Download the required resource (tokenizer models for word_tokenize)

import requests
from bs4 import BeautifulSoup
import time
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from gtts import gTTS
import os
import platform

# Download required NLTK data files.
nltk.download('vader_lexicon')  # lexicon backing SentimentIntensityAnalyzer

nltk.download('averaged_perceptron_tagger')  # POS tagger used by extract_topics
nltk.download('stopwords')  # English stopword list used by extract_topics
20 |
+
|
21 |
+
def get_bing_news_articles(company_name, num_articles=10):
    """
    Scrapes Bing News search results for a given company name.

    Parameters
    ----------
    company_name : str
        Search query; spaces are converted to '+' for the URL.
    num_articles : int, optional
        Maximum number of articles to return (default 10).

    Returns
    -------
    list[dict]
        One dict per article with keys "title", "summary", "url",
        "source". Returns an empty list on any network error or
        non-200 response.

    Notes
    -----
    Relies on Bing's current markup (div.news-card / a.title /
    div.snippet / div.source); if Bing changes its HTML this silently
    returns [].
    """
    query = company_name.replace(" ", "+")
    url = f"https://www.bing.com/news/search?q={query}&FORM=HDRSC6"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    try:
        # BUG FIX: the original call had no timeout (could hang the UI
        # forever) and let RequestException propagate to the caller,
        # which contradicted the "return []" error contract.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        return []
    if response.status_code != 200:
        return []
    soup = BeautifulSoup(response.text, "html.parser")
    articles = []
    for card in soup.find_all("div", class_="news-card"):
        title_tag = card.find("a", class_="title")
        if not title_tag:
            # Card without a title link is not a real article entry.
            continue
        snippet_tag = card.find("div", class_="snippet")
        source_tag = card.find("div", class_="source")
        articles.append({
            "title": title_tag.get_text(strip=True),
            "summary": snippet_tag.get_text(strip=True) if snippet_tag else "",
            "url": title_tag.get("href"),
            "source": source_tag.get_text(strip=True) if source_tag else "",
        })
        if len(articles) >= num_articles:
            break
    return articles
|
53 |
+
|
54 |
+
# Lazily-created shared analyzer. BUG FIX: the original built a fresh
# SentimentIntensityAnalyzer (which reloads the VADER lexicon) on every
# call; with ten articles per report that is ten redundant lexicon loads.
_SIA = None

def analyze_sentiment(text):
    """
    Analyzes the sentiment of the given text using NLTK's VADER.

    Parameters
    ----------
    text : str
        Text to score.

    Returns:
        sentiment (str): "Positive", "Negative", or "Neutral"
        scores (dict): The full set of polarity scores.
    """
    global _SIA
    if _SIA is None:
        _SIA = SentimentIntensityAnalyzer()
    scores = _SIA.polarity_scores(text)
    compound = scores["compound"]
    # Standard VADER convention: |compound| < 0.05 counts as neutral.
    if compound >= 0.05:
        sentiment = "Positive"
    elif compound <= -0.05:
        sentiment = "Negative"
    else:
        sentiment = "Neutral"
    return sentiment, scores
|
71 |
+
|
72 |
+
def extract_topics(text):
    """
    Extracts topics from the input text using basic noun extraction.

    Tokenizes the lower-cased text, removes stopwords and
    non-alphabetic tokens, POS-tags the remainder, and keeps the nouns.

    Parameters
    ----------
    text : str
        Input text (typically an article title plus summary).

    Returns
    -------
    list[str]
        Unique nouns in first-occurrence order. BUG FIX: the original
        `list(set(nouns))` returned a different ordering on every run
        (set iteration order is not stable across processes), which made
        the generated reports non-deterministic.
    """
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words("english"))
    filtered = [word for word in tokens if word.isalpha() and word not in stop_words]
    tagged = nltk.pos_tag(filtered)
    nouns = [word for word, pos in tagged if pos in ("NN", "NNS", "NNP", "NNPS")]
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(nouns))
|
84 |
+
|
85 |
+
def comparative_analysis(articles):
    """
    Performs comparative analysis across articles.

    Parameters
    ----------
    articles : list[dict]
        Article dicts; reads "title", "summary" and the optional
        "sentiment" key, and (side effect) writes/overwrites a "topics"
        key on each article.

    Returns
    -------
    dict
        {"Sentiment Distribution": {...}, "Coverage Differences": str,
         "Topic Overlap": {"Common Topics": [...], "Unique Topics": {...}}}
    """
    sentiment_distribution = {"Positive": 0, "Negative": 0, "Neutral": 0}
    sales_keywords = {"sales", "growth", "record", "profit"}
    regulatory_keywords = {"regulation", "regulatory", "scrutiny", "lawsuit", "legal", "compliance"}
    sales_count = 0
    reg_count = 0
    all_topics = []
    for article in articles:
        sentiment = article.get("sentiment", "Neutral")
        # BUG FIX: an unexpected sentiment label used to raise KeyError;
        # unknown labels are now folded into Neutral.
        if sentiment not in sentiment_distribution:
            sentiment = "Neutral"
        sentiment_distribution[sentiment] += 1
        combined_text = f"{article['title']} {article['summary']}".lower()
        # NOTE(review): plain substring matching also hits words that
        # merely contain a keyword (e.g. "legal" inside "illegal") —
        # kept as-is to preserve existing report numbers; confirm intent.
        if any(keyword in combined_text for keyword in sales_keywords):
            sales_count += 1
        if any(keyword in combined_text for keyword in regulatory_keywords):
            reg_count += 1
        topics = extract_topics(combined_text)
        article["topics"] = topics
        all_topics.extend(topics)
    if sales_count > reg_count:
        coverage_insight = (f"More articles ({sales_count}) emphasize sales and financial growth compared to regulatory concerns ({reg_count}).")
    elif reg_count > sales_count:
        coverage_insight = (f"More articles ({reg_count}) focus on regulatory or legal challenges compared to sales aspects ({sales_count}).")
    else:
        coverage_insight = (f"An equal number of articles emphasize sales/growth and regulatory issues ({sales_count} each).")
    topic_counter = Counter(all_topics)
    # Topics appearing in more than one article are "common"; topics
    # appearing exactly once are unique to their article.
    common_topics = [topic for topic, count in topic_counter.items() if count > 1]
    unique_topics = {}
    for i, article in enumerate(articles, start=1):
        unique = [topic for topic in article.get("topics", []) if topic_counter[topic] == 1]
        unique_topics[f"Article {i}"] = unique
    analysis = {
        "Sentiment Distribution": sentiment_distribution,
        "Coverage Differences": coverage_insight,
        "Topic Overlap": {
            "Common Topics": common_topics,
            "Unique Topics": unique_topics
        }
    }
    return analysis
|
127 |
+
|
128 |
+
def convert_text_to_hindi_tts(text, output_file="output.mp3"):
    """
    Converts the input text into Hindi speech using gTTS and saves it as an MP3 file.

    Parameters
    ----------
    text : str
        Text to synthesize (expected to be Hindi).
    output_file : str, optional
        Destination MP3 path (default "output.mp3").

    Returns
    -------
    str
        The path the audio file was written to.
    """
    speech = gTTS(text=text, lang='hi', slow=False)
    speech.save(output_file)
    return output_file
|
135 |
+
|
136 |
+
def play_audio(file_path):
    """
    Plays an audio file using the system's default media player.

    Parameters
    ----------
    file_path : str
        Path of the audio file to open.

    Notes
    -----
    Fire-and-forget: does not wait for playback to finish and does not
    report whether a suitable player was available.
    """
    import subprocess  # stdlib; imported locally to keep this fix self-contained

    system = platform.system()
    if system == "Windows":
        os.startfile(file_path)
    elif system == "Darwin":
        # BUG FIX: os.system(f"open {file_path}") broke on paths with
        # spaces and allowed shell injection through the file name;
        # subprocess.run with an argument list avoids both.
        subprocess.run(["open", file_path], check=False)
    else:
        subprocess.run(["mpg123", file_path], check=False)
|