murtaza2801 commited on
Commit
5811516
·
verified ·
1 Parent(s): bc33717

Upload 3 files

Browse files
Files changed (3) hide show
  1. api.py +35 -0
  2. app.py +79 -0
  3. utils.py +148 -0
api.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from fastapi.responses import FileResponse
3
+ import time
4
+ from utils import (
5
+ get_bing_news_articles,
6
+ analyze_sentiment,
7
+ extract_topics,
8
+ comparative_analysis,
9
+ convert_text_to_hindi_tts,
10
+ )
11
+
12
# FastAPI application object; serves the /news and /tts endpoints below.
app = FastAPI(title="News Summarization & TTS API")
13
+
14
@app.get("/news")
def get_news(company: str, num_articles: int = 10):
    """Fetch news articles for *company*, annotate each with sentiment and
    topics, and return them alongside a comparative analysis.

    Raises:
        HTTPException: 404 when the scraper returns no articles.
    """
    articles = get_bing_news_articles(company, num_articles=num_articles)
    if not articles:
        raise HTTPException(status_code=404, detail="No articles found.")
    for item in articles:
        # Sentiment/topic extraction works on title + summary when a summary exists.
        text = item["title"]
        summary = item["summary"]
        if summary:
            text = f"{text}. {summary}"
        label, polarity = analyze_sentiment(text)
        item.update(
            sentiment=label,
            sentiment_scores=polarity,
            topics=extract_topics(text),
        )
        time.sleep(0.5)  # brief pause between articles, as in the original flow
    return {"articles": articles, "analysis": comparative_analysis(articles)}
30
+
31
@app.get("/tts")
def get_tts(text: str):
    """Convert *text* to Hindi speech and return the MP3 as a file response.

    NOTE(review): the output path is fixed, so concurrent requests overwrite
    each other's audio — behavior preserved here as-is.
    """
    audio_path = "output.mp3"
    convert_text_to_hindi_tts(text, output_file=audio_path)
    return FileResponse(audio_path, media_type="audio/mpeg", filename=audio_path)
app.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import time
3
+ from utils import (
4
+ get_bing_news_articles,
5
+ analyze_sentiment,
6
+ extract_topics,
7
+ comparative_analysis,
8
+ convert_text_to_hindi_tts,
9
+ )
10
+ from collections import Counter
11
+ from googletrans import Translator
12
+
13
# Initialize the translator (googletrans; performs network calls at translate time).
translator = Translator()

st.title("News Summarization & Sentiment Analysis with Automatic Hindi Translation & TTS")
st.write("Enter a company name to fetch news articles, analyze sentiment, and generate a final summary automatically converted to Hindi.")

# Company to search news for; defaults to "Tesla".
company = st.text_input("Company Name", "Tesla")

if st.button("Generate Report"):
    with st.spinner("Fetching news articles..."):
        articles = get_bing_news_articles(company, num_articles=10)

    if not articles:
        st.error("No articles found or there was an error fetching the articles.")
    else:
        # Process each article: perform sentiment analysis on title + summary.
        for article in articles:
            combined_text = article["title"]
            if article["summary"]:
                combined_text += ". " + article["summary"]
            sentiment, scores = analyze_sentiment(combined_text)
            article["sentiment"] = sentiment
            article["sentiment_scores"] = scores
            # Topics are extracted for internal analysis but not used in the final summary.
            article["topics"] = extract_topics(combined_text)
            # Small delay between articles; presumably throttling — TODO confirm it is needed.
            time.sleep(0.5)

        # Display extracted articles.
        st.subheader("Extracted Articles")
        for idx, article in enumerate(articles, start=1):
            st.markdown(f"**Article {idx}:**")
            st.write("Title:", article["title"])
            st.write("Summary:", article["summary"])
            st.write("Source:", article["source"])
            st.write("URL:", article["url"])
            st.write("Sentiment:", article["sentiment"])
            st.markdown("---")

        # Perform comparative analysis for internal metrics.
        analysis = comparative_analysis(articles)
        st.subheader("Comparative Analysis")
        st.write("**Sentiment Distribution:**", analysis["Sentiment Distribution"])
        st.write("**Coverage Differences:**", analysis["Coverage Differences"])

        # Create a final summary report in English.
        # NOTE(review): only the sentence with counts is data-driven; the
        # "sales growth ... predominantly positive" sentences are hard-coded
        # boilerplate and may contradict the actual sentiment distribution.
        total_articles = len(articles)
        dist = analysis["Sentiment Distribution"]
        final_summary_en = (
            f"Out of a total of {total_articles} articles, {dist.get('Positive', 0)} articles are positive, "
            f"{dist.get('Negative', 0)} are negative, and {dist.get('Neutral', 0)} are neutral. "
            "Many articles emphasize sales growth and financial development, while some discuss regulatory challenges and legal issues. "
            "Overall, the news coverage of the company is predominantly positive, suggesting potential market growth."
        )

        # Automatically translate the final summary to Hindi (network call; unhandled
        # failures here will surface as a Streamlit exception).
        translation = translator.translate(final_summary_en, dest='hi')
        final_summary_hi = translation.text

        st.subheader("Final Summary Report (Hindi)")
        st.markdown(final_summary_hi)

        # Convert the Hindi summary into speech (gTTS; writes summary_hi.mp3 to cwd).
        with st.spinner("Generating Hindi TTS audio..."):
            audio_file = convert_text_to_hindi_tts(final_summary_hi, output_file="summary_hi.mp3")

        st.success("Audio summary generated!")
        st.audio(audio_file)
utils.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import time
4
+ import nltk
5
+ from nltk.sentiment import SentimentIntensityAnalyzer
6
+ from nltk.tokenize import word_tokenize
7
+ from nltk.corpus import stopwords
8
+ from collections import Counter
9
+ from gtts import gTTS
10
+ import os
11
+ import platform
12
+
13
+ # Download required NLTK data files (if not already available).
14
+ nltk.download('vader_lexicon')
15
+ nltk.download('punkt')
16
+ nltk.download('averaged_perceptron_tagger')
17
+ nltk.download('stopwords')
18
+
19
def get_bing_news_articles(company_name, num_articles=10):
    """
    Scrape Bing News search results for a given company name.

    Parameters:
        company_name (str): Company to search for; spaces become '+' in the query.
        num_articles (int): Maximum number of articles to return.

    Returns:
        list[dict]: Articles with keys "title", "summary", "url", "source".
                    Empty list on network errors or a non-200 response.
    """
    query = company_name.replace(" ", "+")
    url = f"https://www.bing.com/news/search?q={query}&FORM=HDRSC6"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    try:
        # Timeout prevents the caller from hanging indefinitely on a stalled
        # connection; RequestException covers DNS/connection/timeout failures,
        # which previously propagated uncaught.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        return []
    if response.status_code != 200:
        return []
    soup = BeautifulSoup(response.text, "html.parser")
    articles = []
    # Each Bing result is rendered as a <div class="news-card">.
    for card in soup.find_all("div", class_="news-card"):
        title_tag = card.find("a", class_="title")
        if not title_tag:
            continue  # skip cards without a headline link
        snippet_tag = card.find("div", class_="snippet")
        source_tag = card.find("div", class_="source")
        articles.append({
            "title": title_tag.get_text(strip=True),
            "summary": snippet_tag.get_text(strip=True) if snippet_tag else "",
            "url": title_tag.get("href"),
            "source": source_tag.get_text(strip=True) if source_tag else "",
        })
        if len(articles) >= num_articles:
            break
    return articles
52
+
53
def analyze_sentiment(text):
    """
    Analyzes the sentiment of the given text using NLTK's VADER.

    Returns:
        sentiment (str): "Positive", "Negative", or "Neutral"
        scores (dict): The full set of polarity scores (neg/neu/pos/compound).
    """
    # Reuse one analyzer instance across calls: constructing it reloads the
    # VADER lexicon each time, which is wasteful when called per article.
    sia = getattr(analyze_sentiment, "_sia", None)
    if sia is None:
        sia = analyze_sentiment._sia = SentimentIntensityAnalyzer()
    scores = sia.polarity_scores(text)
    compound = scores["compound"]
    # Standard VADER convention: |compound| < 0.05 is considered neutral.
    if compound >= 0.05:
        sentiment = "Positive"
    elif compound <= -0.05:
        sentiment = "Negative"
    else:
        sentiment = "Neutral"
    return sentiment, scores
70
+
71
def extract_topics(text):
    """
    Extracts topics from the input text using basic noun extraction.

    Lower-cases and tokenizes the text, removes stopwords and punctuation,
    POS-tags the remaining words, and returns the unique nouns.

    Returns:
        list[str]: Unique nouns in first-seen order. (The previous
        list(set(...)) produced a hash-dependent, nondeterministic order.)
    """
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words("english"))
    filtered = [word for word in tokens if word.isalpha() and word not in stop_words]
    tagged = nltk.pos_tag(filtered)
    nouns = [word for word, pos in tagged if pos in ("NN", "NNS", "NNP", "NNPS")]
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(nouns))
83
+
84
def comparative_analysis(articles):
    """
    Performs comparative analysis across articles.

    Returns a dictionary with:
        - "Sentiment Distribution": count of articles per sentiment label.
        - "Coverage Differences": insight string based on keyword presence.
        - "Topic Overlap": common topics and unique topics per article.

    Side effect: each article's "topics" entry is overwritten with topics
    recomputed from its lower-cased title + summary.
    """
    sentiment_distribution = {"Positive": 0, "Negative": 0, "Neutral": 0}
    sales_keywords = {"sales", "growth", "record", "profit"}
    regulatory_keywords = {"regulation", "regulatory", "scrutiny", "lawsuit", "legal", "compliance"}
    sales_count = 0
    reg_count = 0
    all_topics = []
    for article in articles:
        sentiment = article.get("sentiment", "Neutral")
        # .get-based increment tolerates unexpected sentiment labels instead of
        # raising KeyError (the original indexed only the three fixed keys).
        sentiment_distribution[sentiment] = sentiment_distribution.get(sentiment, 0) + 1
        # .get on "summary" avoids a KeyError for articles missing that field.
        combined_text = f"{article['title']} {article.get('summary', '')}".lower()
        if any(keyword in combined_text for keyword in sales_keywords):
            sales_count += 1
        if any(keyword in combined_text for keyword in regulatory_keywords):
            reg_count += 1
        topics = extract_topics(combined_text)
        article["topics"] = topics
        all_topics.extend(topics)
    if sales_count > reg_count:
        coverage_insight = (f"More articles ({sales_count}) emphasize sales and financial growth compared to regulatory concerns ({reg_count}).")
    elif reg_count > sales_count:
        coverage_insight = (f"More articles ({reg_count}) focus on regulatory or legal challenges compared to sales aspects ({sales_count}).")
    else:
        coverage_insight = (f"An equal number of articles emphasize sales/growth and regulatory issues ({sales_count} each).")
    topic_counter = Counter(all_topics)
    # A topic appearing in more than one article counts as "common".
    common_topics = [topic for topic, count in topic_counter.items() if count > 1]
    unique_topics = {}
    for i, article in enumerate(articles, start=1):
        unique = [topic for topic in article.get("topics", []) if topic_counter[topic] == 1]
        unique_topics[f"Article {i}"] = unique
    analysis = {
        "Sentiment Distribution": sentiment_distribution,
        "Coverage Differences": coverage_insight,
        "Topic Overlap": {
            "Common Topics": common_topics,
            "Unique Topics": unique_topics
        }
    }
    return analysis
130
+
131
def convert_text_to_hindi_tts(text, output_file="output.mp3"):
    """Render *text* as Hindi speech with gTTS and save it as an MP3.

    Returns:
        str: The path of the saved audio file (same as *output_file*).
    """
    gTTS(text=text, lang='hi', slow=False).save(output_file)
    return output_file
138
+
139
def play_audio(file_path):
    """
    Plays an audio file using the system's default media player.

    On Windows uses os.startfile; on macOS invokes `open`; elsewhere falls
    back to `mpg123`. Uses subprocess argument lists instead of the previous
    os.system(f"... {file_path}") shell strings, which broke on paths
    containing spaces and allowed shell injection via the file name.
    """
    import subprocess  # local import: only needed by this utility

    system = platform.system()
    if system == "Windows":
        os.startfile(file_path)
    elif system == "Darwin":
        subprocess.run(["open", file_path], check=False)
    else:
        subprocess.run(["mpg123", file_path], check=False)