afanyu237 committed on
Commit f8337ca · verified · 1 Parent(s): 9688c15

Create app.py

Files changed (1)
  1. app.py +438 -0
app.py ADDED
import streamlit as st
st.set_page_config(page_title="WhatsApp Chat Analyzer", layout="wide")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import preprocessor, helper
from sentiment import predict_sentiment_batch
import os
os.environ["STREAMLIT_SERVER_RUN_ON_SAVE"] = "false"
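# NOTE (assumed interfaces): `preprocessor`, `helper`, and `sentiment` are local
# modules shipped alongside app.py; their code is not part of this commit. From
# the call sites below they are assumed to behave roughly like:
#   preprocessor.preprocess(raw_text)      -> (chat DataFrame, ...)
#   helper.fetch_stats(user, df)           -> (num_messages, words, num_media, num_links)
#   predict_sentiment_batch(list_of_texts) -> one sentiment label per input message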

# Theme customization
st.markdown(
    """
    <style>
    .main {background-color: #f0f2f6;}
    </style>
    """,
    unsafe_allow_html=True
)

# Set seaborn style
sns.set_theme(style="whitegrid")

st.title("📊 WhatsApp Chat Sentiment Analysis Dashboard")
st.subheader('Instructions')
st.markdown("1. Open the sidebar and upload your WhatsApp chat file in .txt format.")
st.markdown("2. Wait for the initial processing (minimal delay).")
st.markdown("3. Customize the analysis by selecting users or filters.")
st.markdown("4. Click 'Show Analysis' for detailed results.")

st.sidebar.title("WhatsApp Chat Analyzer")
uploaded_file = st.sidebar.file_uploader("Upload your chat file (.txt)", type="txt")

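# st.cache_data memoises load_and_preprocess on the uploaded file's text, so
# widget interactions and reruns do not re-parse an unchanged chat export.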
@st.cache_data
def load_and_preprocess(file_content):
    return preprocessor.preprocess(file_content)

if uploaded_file is not None:
    raw_data = uploaded_file.read().decode("utf-8")
    with st.spinner("Loading chat data..."):
        df, _ = load_and_preprocess(raw_data)
    st.session_state.df = df

    st.sidebar.header("🔍 Filters")
    user_list = ["Overall"] + sorted(df["user"].unique().tolist())
    selected_user = st.sidebar.selectbox("Select User", user_list)

    # Work on a copy so the 'sentiment' column added below does not mutate the
    # cached DataFrame or raise SettingWithCopyWarning on a filtered slice.
    df_filtered = df.copy() if selected_user == "Overall" else df[df["user"] == selected_user].copy()

    if st.sidebar.button("Show Analysis"):
        if df_filtered.empty:
            st.warning(f"No data found for user: {selected_user}")
        else:
            with st.spinner("Analyzing..."):
                if 'sentiment' not in df_filtered.columns:
                    try:
                        print("Starting sentiment analysis...")
                        # Get messages as clean strings, remembering which rows are
                        # non-empty so predictions stay aligned with the DataFrame.
                        messages = df_filtered["message"].astype(str)
                        non_empty = messages.str.strip() != ""
                        message_list = messages[non_empty].tolist()

                        print(f"Processing {len(message_list)} messages")
                        print(f"Sample messages: {message_list[:5]}")

                        # Call the sentiment model and assign the predictions back to
                        # the non-empty rows only (lengths match by construction).
                        df_filtered.loc[non_empty, 'sentiment'] = predict_sentiment_batch(message_list)
                        print("Sentiment analysis completed successfully")

                    except Exception as e:
                        st.error(f"Sentiment analysis failed: {str(e)}")
                        print(f"Full error: {str(e)}")

                    st.session_state.df_filtered = df_filtered
                else:
                    st.session_state.df_filtered = df_filtered

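            # Assumption: the sentiment labels are the lowercase strings
            # 'positive' / 'neutral' / 'negative'; the groupbys and per-sentiment
            # tables further down rely on exactly these three values.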
            # Display statistics and visualizations
            num_messages, words, num_media, num_links = helper.fetch_stats(selected_user, df_filtered)
            st.title("Top Statistics")
            col1, col2, col3, col4 = st.columns(4)
            with col1:
                st.header("Total Messages")
                st.title(num_messages)
            with col2:
                st.header("Total Words")
                st.title(words)
            with col3:
                st.header("Media Shared")
                st.title(num_media)
            with col4:
                st.header("Links Shared")
                st.title(num_links)

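            # The two timelines below are drawn from a random sample of at most
            # 5000 messages to keep plotting responsive; for larger chats the
            # counts are approximate and can vary slightly between runs.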
            st.title("Monthly Timeline")
            timeline = helper.monthly_timeline(selected_user, df_filtered.sample(min(5000, len(df_filtered))))
            if not timeline.empty:
                plt.figure(figsize=(10, 5))
                sns.lineplot(data=timeline, x='time', y='message', color='green')
                plt.title("Monthly Timeline")
                plt.xlabel("Date")
                plt.ylabel("Messages")
                st.pyplot(plt)
                plt.clf()

            st.title("Daily Timeline")
            daily_timeline = helper.daily_timeline(selected_user, df_filtered.sample(min(5000, len(df_filtered))))
            if not daily_timeline.empty:
                plt.figure(figsize=(10, 5))
                sns.lineplot(data=daily_timeline, x='date', y='message', color='black')
                plt.title("Daily Timeline")
                plt.xlabel("Date")
                plt.ylabel("Messages")
                st.pyplot(plt)
                plt.clf()

            st.title("Activity Map")
            col1, col2 = st.columns(2)
            with col1:
                st.header("Most Busy Day")
                busy_day = helper.week_activity_map(selected_user, df_filtered)
                if not busy_day.empty:
                    plt.figure(figsize=(10, 5))
                    sns.barplot(x=busy_day.index, y=busy_day.values, palette="Purples_r")
                    plt.title("Most Busy Day")
                    plt.xlabel("Day of Week")
                    plt.ylabel("Message Count")
                    st.pyplot(plt)
                    plt.clf()
            with col2:
                st.header("Most Busy Month")
                busy_month = helper.month_activity_map(selected_user, df_filtered)
                if not busy_month.empty:
                    plt.figure(figsize=(10, 5))
                    sns.barplot(x=busy_month.index, y=busy_month.values, palette="Oranges_r")
                    plt.title("Most Busy Month")
                    plt.xlabel("Month")
                    plt.ylabel("Message Count")
                    st.pyplot(plt)
                    plt.clf()

            if selected_user == 'Overall':
                st.title("Most Busy Users")
                x, new_df = helper.most_busy_users(df_filtered)
                if not x.empty:
                    plt.figure(figsize=(10, 5))
                    sns.barplot(x=x.index, y=x.values, palette="Reds_r")
                    plt.title("Most Busy Users")
                    plt.xlabel("User")
                    plt.ylabel("Message Count")
                    plt.xticks(rotation=45)
                    st.pyplot(plt)
                    st.title("Word Count by User")
                    plt.clf()
                    st.dataframe(new_df)

            # Most common words analysis
            st.title("Most Common Words")
            most_common_df = helper.most_common_words(selected_user, df_filtered)
            if not most_common_df.empty:
                fig, ax = plt.subplots(figsize=(10, 6))
                sns.barplot(y=most_common_df[0], x=most_common_df[1], ax=ax, palette="Blues_r")
                ax.set_title("Top 20 Most Common Words")
                ax.set_xlabel("Frequency")
                ax.set_ylabel("Words")
                plt.xticks(rotation='vertical')
                st.pyplot(fig)
                plt.clf()
            else:
                st.warning("No data available for most common words.")

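            # Assumption: helper.emoji_helper returns a two-column DataFrame with
            # the emoji in column 0 and its count in column 1, which is how the
            # table and pie chart below index it.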
            # Emoji analysis
            st.title("Emoji Analysis")
            emoji_df = helper.emoji_helper(selected_user, df_filtered)
            if not emoji_df.empty:
                col1, col2 = st.columns(2)

                with col1:
                    st.subheader("Top Emojis Used")
                    st.dataframe(emoji_df)

                with col2:
                    fig, ax = plt.subplots(figsize=(8, 8))
                    ax.pie(emoji_df[1].head(), labels=emoji_df[0].head(),
                           autopct="%0.2f%%", startangle=90,
                           colors=sns.color_palette("pastel"))
                    ax.set_title("Top Emoji Distribution")
                    st.pyplot(fig)
                    plt.clf()
            else:
                st.warning("No data available for emoji analysis.")

            # Sentiment Analysis Visualizations
            st.title("📈 Sentiment Analysis")

            # Convert month names to abbreviated format
            month_map = {
                'January': 'Jan', 'February': 'Feb', 'March': 'Mar', 'April': 'Apr',
                'May': 'May', 'June': 'Jun', 'July': 'Jul', 'August': 'Aug',
                'September': 'Sep', 'October': 'Oct', 'November': 'Nov', 'December': 'Dec'
            }
            df_filtered['month'] = df_filtered['month'].map(month_map)

            # Group by month and sentiment
            monthly_sentiment = df_filtered.groupby(['month', 'sentiment']).size().unstack(fill_value=0)
            # Keep months in calendar order (groupby sorts them alphabetically)
            month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                           'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
            monthly_sentiment = monthly_sentiment.reindex(
                [m for m in month_order if m in monthly_sentiment.index])

            # Plotting: Histogram (Bar Chart) for each sentiment
            st.write("### Sentiment Count by Month (Histogram)")

            # Create a figure with subplots for each sentiment
            fig, axes = plt.subplots(1, 3, figsize=(18, 5))

            # Plot Positive Sentiment
            if 'positive' in monthly_sentiment:
                axes[0].bar(monthly_sentiment.index, monthly_sentiment['positive'], color='green')
                axes[0].set_title('Positive Sentiment')
                axes[0].set_xlabel('Month')
                axes[0].set_ylabel('Count')

            # Plot Neutral Sentiment
            if 'neutral' in monthly_sentiment:
                axes[1].bar(monthly_sentiment.index, monthly_sentiment['neutral'], color='blue')
                axes[1].set_title('Neutral Sentiment')
                axes[1].set_xlabel('Month')
                axes[1].set_ylabel('Count')

            # Plot Negative Sentiment
            if 'negative' in monthly_sentiment:
                axes[2].bar(monthly_sentiment.index, monthly_sentiment['negative'], color='red')
                axes[2].set_title('Negative Sentiment')
                axes[2].set_xlabel('Month')
                axes[2].set_ylabel('Count')

            # Display the plots in Streamlit
            st.pyplot(fig)
            plt.clf()

            # Count sentiments per day of the week
            sentiment_counts = df_filtered.groupby(['day_of_week', 'sentiment']).size().unstack(fill_value=0)

            # Sort days correctly
            day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
            sentiment_counts = sentiment_counts.reindex(day_order)

            # Daily Sentiment Analysis
            st.write("### Daily Sentiment Analysis")

            # Create a Matplotlib figure
            fig, ax = plt.subplots(figsize=(10, 5))
            sentiment_counts.plot(kind='bar', stacked=False, ax=ax, color=['red', 'blue', 'green'])

            # Customize the plot
            ax.set_xlabel("Day of the Week")
            ax.set_ylabel("Count")
            ax.set_title("Sentiment Distribution per Day of the Week")
            ax.legend(title="Sentiment")

            # Display the plot in Streamlit
            st.pyplot(fig)
            plt.clf()

            # Count messages per user per sentiment (only for Overall view)
            if selected_user == 'Overall':
                sentiment_counts = df_filtered.groupby(['user', 'sentiment']).size().reset_index(name='Count')

                # Calculate total messages per sentiment
                total_per_sentiment = df_filtered['sentiment'].value_counts().to_dict()

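                # 'Percentage' is each user's share of all messages carrying that
                # sentiment: user count / total count for the sentiment * 100.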
                # Add percentage column
                sentiment_counts['Percentage'] = sentiment_counts.apply(
                    lambda row: (row['Count'] / total_per_sentiment[row['sentiment']]) * 100, axis=1
                )

                # Separate tables for each sentiment
                positive_df = sentiment_counts[sentiment_counts['sentiment'] == 'positive'].sort_values(by='Count', ascending=False).head(10)
                neutral_df = sentiment_counts[sentiment_counts['sentiment'] == 'neutral'].sort_values(by='Count', ascending=False).head(10)
                negative_df = sentiment_counts[sentiment_counts['sentiment'] == 'negative'].sort_values(by='Count', ascending=False).head(10)

                # Sentiment Contribution Analysis
                st.write("### Sentiment Contribution by User")

                # Create three columns for side-by-side display
                col1, col2, col3 = st.columns(3)

                # Display Positive Table
                with col1:
                    st.subheader("Top Positive Contributors")
                    if not positive_df.empty:
                        st.dataframe(positive_df[['user', 'Count', 'Percentage']])
                    else:
                        st.warning("No positive sentiment data")

                # Display Neutral Table
                with col2:
                    st.subheader("Top Neutral Contributors")
                    if not neutral_df.empty:
                        st.dataframe(neutral_df[['user', 'Count', 'Percentage']])
                    else:
                        st.warning("No neutral sentiment data")

                # Display Negative Table
                with col3:
                    st.subheader("Top Negative Contributors")
                    if not negative_df.empty:
                        st.dataframe(negative_df[['user', 'Count', 'Percentage']])
                    else:
                        st.warning("No negative sentiment data")

            # Topic Analysis Section
            st.title("🔍 Area of Focus: Topic Analysis")

            # Check if topic column exists, otherwise perform topic modeling
            # if 'topic' not in df_filtered.columns:
            #     with st.spinner("Performing topic modeling..."):
            #         try:
            #             # Add topic modeling here or ensure your helper functions handle it
            #             df_filtered = helper.perform_topic_modeling(df_filtered)
            #         except Exception as e:
            #             st.error(f"Topic modeling failed: {str(e)}")
            #             st.stop()

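            # Assumption: the 'topic' column and the helper.plot_topic_* functions
            # are provided elsewhere in the pipeline; the in-app topic-modeling
            # call above is commented out, so this section degrades to warnings
            # when topic data is missing.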
            # Plot Topic Distribution
            st.header("Topic Distribution")
            try:
                fig = helper.plot_topic_distribution(df_filtered)
                st.pyplot(fig)
                plt.clf()
            except Exception as e:
                st.warning(f"Could not display topic distribution: {str(e)}")

            # Display Sample Messages for Each Topic
            st.header("Sample Messages for Each Topic")
            if 'topic' in df_filtered.columns:
                for topic_id in sorted(df_filtered['topic'].unique()):
                    st.subheader(f"Topic {topic_id}")

                    # Get messages for the current topic
                    filtered_messages = df_filtered[df_filtered['topic'] == topic_id]['message']

                    # Determine sample size
                    sample_size = min(5, len(filtered_messages))

                    if sample_size > 0:
                        sample_messages = filtered_messages.sample(sample_size, replace=False).tolist()
                        for msg in sample_messages:
                            st.write(f"- {msg}")
                    else:
                        st.write("No messages available for this topic.")
            else:
                st.warning("Topic information not available")

            # Topic Distribution Over Time
            st.header("📅 Topic Trends Over Time")

            # Add time frequency selector
            time_freq = st.selectbox("Select Time Frequency", ["Daily", "Weekly", "Monthly"], key='time_freq')

            # Plot topic trends
            try:
                freq_map = {"Daily": "D", "Weekly": "W", "Monthly": "M"}
                topic_distribution = helper.topic_distribution_over_time(df_filtered, time_freq=freq_map[time_freq])

                # Choose between static and interactive plot
                use_plotly = st.checkbox("Use interactive visualization", value=True, key='use_plotly')

                if use_plotly:
                    fig = helper.plot_topic_distribution_over_time_plotly(topic_distribution)
                    st.plotly_chart(fig, use_container_width=True)
                else:
                    fig = helper.plot_topic_distribution_over_time(topic_distribution)
                    st.pyplot(fig)
                    plt.clf()
            except Exception as e:
                st.warning(f"Could not display topic trends: {str(e)}")

            # Clustering Analysis Section
            st.title("🧩 Conversation Clusters")

            # Number of clusters input
            n_clusters = st.slider("Select number of clusters",
                                   min_value=2,
                                   max_value=10,
                                   value=5,
                                   key='n_clusters')

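            # Assumption: preprocessor.preprocess_for_clustering returns the
            # clustered DataFrame (with a 'cluster' column), 2-D reduced features
            # for plotting, and a third value that is unused here; the helper.get_*
            # calls below are likewise inferred from how they are used.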
            # Perform clustering
            with st.spinner("Analyzing conversation clusters..."):
                try:
                    df_clustered, reduced_features, _ = preprocessor.preprocess_for_clustering(df_filtered, n_clusters=n_clusters)

                    # Plot clusters
                    st.header("Cluster Visualization")
                    fig = helper.plot_clusters(reduced_features, df_clustered['cluster'])
                    st.pyplot(fig)
                    plt.clf()

                    # Cluster Insights
                    st.header("📌 Cluster Insights")

                    # 1. Dominant Conversation Themes
                    st.subheader("1. Dominant Themes")
                    cluster_labels = helper.get_cluster_labels(df_clustered, n_clusters)
                    for cluster_id, label in cluster_labels.items():
                        st.write(f"**Cluster {cluster_id}**: {label}")

                    # 2. Temporal Patterns
                    st.subheader("2. Temporal Patterns")
                    temporal_trends = helper.get_temporal_trends(df_clustered)
                    for cluster_id, trend in temporal_trends.items():
                        st.write(f"**Cluster {cluster_id}**: Peaks on {trend['peak_day']} around {trend['peak_time']}")

                    # 3. User Contributions
                    if selected_user == 'Overall':
                        st.subheader("3. Top Contributors")
                        user_contributions = helper.get_user_contributions(df_clustered)
                        for cluster_id, users in user_contributions.items():
                            st.write(f"**Cluster {cluster_id}**: {', '.join(users[:3])}...")

                    # 4. Sentiment by Cluster
                    st.subheader("4. Sentiment Analysis")
                    sentiment_by_cluster = helper.get_sentiment_by_cluster(df_clustered)
                    for cluster_id, sentiment in sentiment_by_cluster.items():
                        st.write(f"**Cluster {cluster_id}**: {sentiment['positive']}% positive, {sentiment['neutral']}% neutral, {sentiment['negative']}% negative")

                    # Sample messages from each cluster
                    st.subheader("Sample Messages")
                    for cluster_id in sorted(df_clustered['cluster'].unique()):
                        with st.expander(f"Cluster {cluster_id} Messages"):
                            cluster_msgs = df_clustered[df_clustered['cluster'] == cluster_id]['message']
                            sample_size = min(3, len(cluster_msgs))
                            if sample_size > 0:
                                for msg in cluster_msgs.sample(sample_size, replace=False):
                                    st.write(f"- {msg}")
                            else:
                                st.write("No messages available")

                except Exception as e:
                    st.error(f"Clustering failed: {str(e)}")