afanyu237 committed on
Commit f8337ca · verified · 1 Parent(s): 9688c15

Create app.py

Files changed (1)
  1. app.py +438 -0
app.py ADDED
import streamlit as st
st.set_page_config(page_title="WhatsApp Chat Analyzer", layout="wide")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import preprocessor, helper
from sentiment import predict_sentiment_batch
import os
os.environ["STREAMLIT_SERVER_RUN_ON_SAVE"] = "false"
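# NOTE (assumed interfaces): `preprocessor`, `helper`, and `sentiment` are local
# modules shipped alongside app.py; their code is not part of this commit. From
# the call sites below they are assumed to behave roughly like:
#   preprocessor.preprocess(raw_text)      -> (chat DataFrame, ...)
#   helper.fetch_stats(user, df)           -> (num_messages, words, num_media, num_links)
#   predict_sentiment_batch(list_of_texts) -> one sentiment label per input message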

# Theme customization
st.markdown(
    """
    <style>
    .main {background-color: #f0f2f6;}
    </style>
    """,
    unsafe_allow_html=True
)

# Set seaborn style
sns.set_theme(style="whitegrid")

st.title("📊 WhatsApp Chat Sentiment Analysis Dashboard")
st.subheader('Instructions')
st.markdown("1. Open the sidebar and upload your WhatsApp chat file in .txt format.")
st.markdown("2. Wait for the initial processing (minimal delay).")
st.markdown("3. Customize the analysis by selecting users or filters.")
st.markdown("4. Click 'Show Analysis' for detailed results.")

st.sidebar.title("WhatsApp Chat Analyzer")
uploaded_file = st.sidebar.file_uploader("Upload your chat file (.txt)", type="txt")

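# st.cache_data memoises load_and_preprocess on the uploaded file's text, so
# widget interactions and reruns do not re-parse an unchanged chat export.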
@st.cache_data
def load_and_preprocess(file_content):
    return preprocessor.preprocess(file_content)

if uploaded_file is not None:
    raw_data = uploaded_file.read().decode("utf-8")
    with st.spinner("Loading chat data..."):
        df, _ = load_and_preprocess(raw_data)
    st.session_state.df = df

    st.sidebar.header("🔍 Filters")
    user_list = ["Overall"] + sorted(df["user"].unique().tolist())
    selected_user = st.sidebar.selectbox("Select User", user_list)

    # Work on a copy so the 'sentiment' column added below does not mutate the
    # cached DataFrame or raise SettingWithCopyWarning on a filtered slice.
    df_filtered = df.copy() if selected_user == "Overall" else df[df["user"] == selected_user].copy()

    if st.sidebar.button("Show Analysis"):
        if df_filtered.empty:
            st.warning(f"No data found for user: {selected_user}")
        else:
            with st.spinner("Analyzing..."):
                if 'sentiment' not in df_filtered.columns:
                    try:
                        print("Starting sentiment analysis...")
                        # Get messages as clean strings, remembering which rows are
                        # non-empty so predictions stay aligned with the DataFrame.
                        messages = df_filtered["message"].astype(str)
                        non_empty = messages.str.strip() != ""
                        message_list = messages[non_empty].tolist()

                        print(f"Processing {len(message_list)} messages")
                        print(f"Sample messages: {message_list[:5]}")

                        # Call the sentiment model and assign the predictions back to
                        # the non-empty rows only (lengths match by construction).
                        df_filtered.loc[non_empty, 'sentiment'] = predict_sentiment_batch(message_list)
                        print("Sentiment analysis completed successfully")

                    except Exception as e:
                        st.error(f"Sentiment analysis failed: {str(e)}")
                        print(f"Full error: {str(e)}")

                    st.session_state.df_filtered = df_filtered
                else:
                    st.session_state.df_filtered = df_filtered

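            # Assumption: the sentiment labels are the lowercase strings
            # 'positive' / 'neutral' / 'negative'; the groupbys and per-sentiment
            # tables further down rely on exactly these three values.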
            # Display statistics and visualizations
            num_messages, words, num_media, num_links = helper.fetch_stats(selected_user, df_filtered)
            st.title("Top Statistics")
            col1, col2, col3, col4 = st.columns(4)
            with col1:
                st.header("Total Messages")
                st.title(num_messages)
            with col2:
                st.header("Total Words")
                st.title(words)
            with col3:
                st.header("Media Shared")
                st.title(num_media)
            with col4:
                st.header("Links Shared")
                st.title(num_links)

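            # The two timelines below are drawn from a random sample of at most
            # 5000 messages to keep plotting responsive; for larger chats the
            # counts are approximate and can vary slightly between runs.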
            st.title("Monthly Timeline")
            timeline = helper.monthly_timeline(selected_user, df_filtered.sample(min(5000, len(df_filtered))))
            if not timeline.empty:
                plt.figure(figsize=(10, 5))
                sns.lineplot(data=timeline, x='time', y='message', color='green')
                plt.title("Monthly Timeline")
                plt.xlabel("Date")
                plt.ylabel("Messages")
                st.pyplot(plt)
                plt.clf()

            st.title("Daily Timeline")
            daily_timeline = helper.daily_timeline(selected_user, df_filtered.sample(min(5000, len(df_filtered))))
            if not daily_timeline.empty:
                plt.figure(figsize=(10, 5))
                sns.lineplot(data=daily_timeline, x='date', y='message', color='black')
                plt.title("Daily Timeline")
                plt.xlabel("Date")
                plt.ylabel("Messages")
                st.pyplot(plt)
                plt.clf()

            st.title("Activity Map")
            col1, col2 = st.columns(2)
            with col1:
                st.header("Most Busy Day")
                busy_day = helper.week_activity_map(selected_user, df_filtered)
                if not busy_day.empty:
                    plt.figure(figsize=(10, 5))
                    sns.barplot(x=busy_day.index, y=busy_day.values, palette="Purples_r")
                    plt.title("Most Busy Day")
                    plt.xlabel("Day of Week")
                    plt.ylabel("Message Count")
                    st.pyplot(plt)
                    plt.clf()
            with col2:
                st.header("Most Busy Month")
                busy_month = helper.month_activity_map(selected_user, df_filtered)
                if not busy_month.empty:
                    plt.figure(figsize=(10, 5))
                    sns.barplot(x=busy_month.index, y=busy_month.values, palette="Oranges_r")
                    plt.title("Most Busy Month")
                    plt.xlabel("Month")
                    plt.ylabel("Message Count")
                    st.pyplot(plt)
                    plt.clf()

            if selected_user == 'Overall':
                st.title("Most Busy Users")
                x, new_df = helper.most_busy_users(df_filtered)
                if not x.empty:
                    plt.figure(figsize=(10, 5))
                    sns.barplot(x=x.index, y=x.values, palette="Reds_r")
                    plt.title("Most Busy Users")
                    plt.xlabel("User")
                    plt.ylabel("Message Count")
                    plt.xticks(rotation=45)
                    st.pyplot(plt)
                    st.title("Word Count by User")
                    plt.clf()
                    st.dataframe(new_df)

            # Most common words analysis
            st.title("Most Common Words")
            most_common_df = helper.most_common_words(selected_user, df_filtered)
            if not most_common_df.empty:
                fig, ax = plt.subplots(figsize=(10, 6))
                sns.barplot(y=most_common_df[0], x=most_common_df[1], ax=ax, palette="Blues_r")
                ax.set_title("Top 20 Most Common Words")
                ax.set_xlabel("Frequency")
                ax.set_ylabel("Words")
                plt.xticks(rotation='vertical')
                st.pyplot(fig)
                plt.clf()
            else:
                st.warning("No data available for most common words.")

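            # Assumption: helper.emoji_helper returns a two-column DataFrame with
            # the emoji in column 0 and its count in column 1, which is how the
            # table and pie chart below index it.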
            # Emoji analysis
            st.title("Emoji Analysis")
            emoji_df = helper.emoji_helper(selected_user, df_filtered)
            if not emoji_df.empty:
                col1, col2 = st.columns(2)

                with col1:
                    st.subheader("Top Emojis Used")
                    st.dataframe(emoji_df)

                with col2:
                    fig, ax = plt.subplots(figsize=(8, 8))
                    ax.pie(emoji_df[1].head(), labels=emoji_df[0].head(),
                           autopct="%0.2f%%", startangle=90,
                           colors=sns.color_palette("pastel"))
                    ax.set_title("Top Emoji Distribution")
                    st.pyplot(fig)
                    plt.clf()
            else:
                st.warning("No data available for emoji analysis.")

            # Sentiment Analysis Visualizations
            st.title("📈 Sentiment Analysis")

            # Convert month names to abbreviated format
            month_map = {
                'January': 'Jan', 'February': 'Feb', 'March': 'Mar', 'April': 'Apr',
                'May': 'May', 'June': 'Jun', 'July': 'Jul', 'August': 'Aug',
                'September': 'Sep', 'October': 'Oct', 'November': 'Nov', 'December': 'Dec'
            }
            df_filtered['month'] = df_filtered['month'].map(month_map)

            # Group by month and sentiment
            monthly_sentiment = df_filtered.groupby(['month', 'sentiment']).size().unstack(fill_value=0)
            # Keep months in calendar order (groupby sorts them alphabetically)
            month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                           'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
            monthly_sentiment = monthly_sentiment.reindex(
                [m for m in month_order if m in monthly_sentiment.index])

            # Plotting: Histogram (Bar Chart) for each sentiment
            st.write("### Sentiment Count by Month (Histogram)")

            # Create a figure with subplots for each sentiment
            fig, axes = plt.subplots(1, 3, figsize=(18, 5))

            # Plot Positive Sentiment
            if 'positive' in monthly_sentiment:
                axes[0].bar(monthly_sentiment.index, monthly_sentiment['positive'], color='green')
                axes[0].set_title('Positive Sentiment')
                axes[0].set_xlabel('Month')
                axes[0].set_ylabel('Count')

            # Plot Neutral Sentiment
            if 'neutral' in monthly_sentiment:
                axes[1].bar(monthly_sentiment.index, monthly_sentiment['neutral'], color='blue')
                axes[1].set_title('Neutral Sentiment')
                axes[1].set_xlabel('Month')
                axes[1].set_ylabel('Count')

            # Plot Negative Sentiment
            if 'negative' in monthly_sentiment:
                axes[2].bar(monthly_sentiment.index, monthly_sentiment['negative'], color='red')
                axes[2].set_title('Negative Sentiment')
                axes[2].set_xlabel('Month')
                axes[2].set_ylabel('Count')

            # Display the plots in Streamlit
            st.pyplot(fig)
            plt.clf()

            # Count sentiments per day of the week
            sentiment_counts = df_filtered.groupby(['day_of_week', 'sentiment']).size().unstack(fill_value=0)

            # Sort days correctly
            day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
            sentiment_counts = sentiment_counts.reindex(day_order)

            # Daily Sentiment Analysis
            st.write("### Daily Sentiment Analysis")

            # Create a Matplotlib figure
            fig, ax = plt.subplots(figsize=(10, 5))
            sentiment_counts.plot(kind='bar', stacked=False, ax=ax, color=['red', 'blue', 'green'])

            # Customize the plot
            ax.set_xlabel("Day of the Week")
            ax.set_ylabel("Count")
            ax.set_title("Sentiment Distribution per Day of the Week")
            ax.legend(title="Sentiment")

            # Display the plot in Streamlit
            st.pyplot(fig)
            plt.clf()

            # Count messages per user per sentiment (only for Overall view)
            if selected_user == 'Overall':
                sentiment_counts = df_filtered.groupby(['user', 'sentiment']).size().reset_index(name='Count')

                # Calculate total messages per sentiment
                total_per_sentiment = df_filtered['sentiment'].value_counts().to_dict()

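                # 'Percentage' is each user's share of all messages carrying that
                # sentiment: user count / total count for the sentiment * 100.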
                # Add percentage column
                sentiment_counts['Percentage'] = sentiment_counts.apply(
                    lambda row: (row['Count'] / total_per_sentiment[row['sentiment']]) * 100, axis=1
                )

                # Separate tables for each sentiment
                positive_df = sentiment_counts[sentiment_counts['sentiment'] == 'positive'].sort_values(by='Count', ascending=False).head(10)
                neutral_df = sentiment_counts[sentiment_counts['sentiment'] == 'neutral'].sort_values(by='Count', ascending=False).head(10)
                negative_df = sentiment_counts[sentiment_counts['sentiment'] == 'negative'].sort_values(by='Count', ascending=False).head(10)

                # Sentiment Contribution Analysis
                st.write("### Sentiment Contribution by User")

                # Create three columns for side-by-side display
                col1, col2, col3 = st.columns(3)

                # Display Positive Table
                with col1:
                    st.subheader("Top Positive Contributors")
                    if not positive_df.empty:
                        st.dataframe(positive_df[['user', 'Count', 'Percentage']])
                    else:
                        st.warning("No positive sentiment data")

                # Display Neutral Table
                with col2:
                    st.subheader("Top Neutral Contributors")
                    if not neutral_df.empty:
                        st.dataframe(neutral_df[['user', 'Count', 'Percentage']])
                    else:
                        st.warning("No neutral sentiment data")

                # Display Negative Table
                with col3:
                    st.subheader("Top Negative Contributors")
                    if not negative_df.empty:
                        st.dataframe(negative_df[['user', 'Count', 'Percentage']])
                    else:
                        st.warning("No negative sentiment data")

            # Topic Analysis Section
            st.title("🔍 Area of Focus: Topic Analysis")

            # Check if topic column exists, otherwise perform topic modeling
            # if 'topic' not in df_filtered.columns:
            #     with st.spinner("Performing topic modeling..."):
            #         try:
            #             # Add topic modeling here or ensure your helper functions handle it
            #             df_filtered = helper.perform_topic_modeling(df_filtered)
            #         except Exception as e:
            #             st.error(f"Topic modeling failed: {str(e)}")
            #             st.stop()

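            # Assumption: the 'topic' column and the helper.plot_topic_* functions
            # are provided elsewhere in the pipeline; the in-app topic-modeling
            # call above is commented out, so this section degrades to warnings
            # when topic data is missing.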
            # Plot Topic Distribution
            st.header("Topic Distribution")
            try:
                fig = helper.plot_topic_distribution(df_filtered)
                st.pyplot(fig)
                plt.clf()
            except Exception as e:
                st.warning(f"Could not display topic distribution: {str(e)}")

            # Display Sample Messages for Each Topic
            st.header("Sample Messages for Each Topic")
            if 'topic' in df_filtered.columns:
                for topic_id in sorted(df_filtered['topic'].unique()):
                    st.subheader(f"Topic {topic_id}")

                    # Get messages for the current topic
                    filtered_messages = df_filtered[df_filtered['topic'] == topic_id]['message']

                    # Determine sample size
                    sample_size = min(5, len(filtered_messages))

                    if sample_size > 0:
                        sample_messages = filtered_messages.sample(sample_size, replace=False).tolist()
                        for msg in sample_messages:
                            st.write(f"- {msg}")
                    else:
                        st.write("No messages available for this topic.")
            else:
                st.warning("Topic information not available")

            # Topic Distribution Over Time
            st.header("📅 Topic Trends Over Time")

            # Add time frequency selector
            time_freq = st.selectbox("Select Time Frequency", ["Daily", "Weekly", "Monthly"], key='time_freq')

            # Plot topic trends
            try:
                freq_map = {"Daily": "D", "Weekly": "W", "Monthly": "M"}
                topic_distribution = helper.topic_distribution_over_time(df_filtered, time_freq=freq_map[time_freq])

                # Choose between static and interactive plot
                use_plotly = st.checkbox("Use interactive visualization", value=True, key='use_plotly')

                if use_plotly:
                    fig = helper.plot_topic_distribution_over_time_plotly(topic_distribution)
                    st.plotly_chart(fig, use_container_width=True)
                else:
                    fig = helper.plot_topic_distribution_over_time(topic_distribution)
                    st.pyplot(fig)
                    plt.clf()
            except Exception as e:
                st.warning(f"Could not display topic trends: {str(e)}")

            # Clustering Analysis Section
            st.title("🧩 Conversation Clusters")

            # Number of clusters input
            n_clusters = st.slider("Select number of clusters",
                                   min_value=2,
                                   max_value=10,
                                   value=5,
                                   key='n_clusters')

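            # Assumption: preprocessor.preprocess_for_clustering returns the
            # clustered DataFrame (with a 'cluster' column), 2-D reduced features
            # for plotting, and a third value that is unused here; the helper.get_*
            # calls below are likewise inferred from how they are used.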
            # Perform clustering
            with st.spinner("Analyzing conversation clusters..."):
                try:
                    df_clustered, reduced_features, _ = preprocessor.preprocess_for_clustering(df_filtered, n_clusters=n_clusters)

                    # Plot clusters
                    st.header("Cluster Visualization")
                    fig = helper.plot_clusters(reduced_features, df_clustered['cluster'])
                    st.pyplot(fig)
                    plt.clf()

                    # Cluster Insights
                    st.header("📌 Cluster Insights")

                    # 1. Dominant Conversation Themes
                    st.subheader("1. Dominant Themes")
                    cluster_labels = helper.get_cluster_labels(df_clustered, n_clusters)
                    for cluster_id, label in cluster_labels.items():
                        st.write(f"**Cluster {cluster_id}**: {label}")

                    # 2. Temporal Patterns
                    st.subheader("2. Temporal Patterns")
                    temporal_trends = helper.get_temporal_trends(df_clustered)
                    for cluster_id, trend in temporal_trends.items():
                        st.write(f"**Cluster {cluster_id}**: Peaks on {trend['peak_day']} around {trend['peak_time']}")

                    # 3. User Contributions
                    if selected_user == 'Overall':
                        st.subheader("3. Top Contributors")
                        user_contributions = helper.get_user_contributions(df_clustered)
                        for cluster_id, users in user_contributions.items():
                            st.write(f"**Cluster {cluster_id}**: {', '.join(users[:3])}...")

                    # 4. Sentiment by Cluster
                    st.subheader("4. Sentiment Analysis")
                    sentiment_by_cluster = helper.get_sentiment_by_cluster(df_clustered)
                    for cluster_id, sentiment in sentiment_by_cluster.items():
                        st.write(f"**Cluster {cluster_id}**: {sentiment['positive']}% positive, {sentiment['neutral']}% neutral, {sentiment['negative']}% negative")

                    # Sample messages from each cluster
                    st.subheader("Sample Messages")
                    for cluster_id in sorted(df_clustered['cluster'].unique()):
                        with st.expander(f"Cluster {cluster_id} Messages"):
                            cluster_msgs = df_clustered[df_clustered['cluster'] == cluster_id]['message']
                            sample_size = min(3, len(cluster_msgs))
                            if sample_size > 0:
                                for msg in cluster_msgs.sample(sample_size, replace=False):
                                    st.write(f"- {msg}")
                            else:
                                st.write("No messages available")

                except Exception as e:
                    st.error(f"Clustering failed: {str(e)}")