Spaces:

Daksh0505
/

Game-Of-Thrones-Character-Similarity

Running

App Files Files Community

Daksh0505 committed on Sep 18

Commit

60d52c9

verified ·

1 Parent(s): 4e53b85

Upload 4 files

Browse files

Files changed (4) hide show

app2.py +336 -0
characters_list_got.joblib +3 -0
embeddings_got.joblib +3 -0
tfidf_embeddings_got.joblib +3 -0

app2.py ADDED Viewed

	@@ -0,0 +1,336 @@

+import streamlit as st
+import joblib
+import pandas as pd
+import numpy as np
+import os
+from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.manifold import TSNE
+from sklearn.decomposition import PCA
+from PIL import Image
+import plotly.express as px
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
+# Cache the data loading
+@st.cache_data
+def load_data():
+    characters_df = pd.DataFrame(joblib.load('characters_list_got.joblib'), columns=['character'])
+    characters_df['normalized'] = characters_df['character'].str.lower().str.strip()
+    character_names = sorted(characters_df['character'].tolist())
+    sbert_embeddings = joblib.load('embeddings_got.joblib')
+    tfidf_embeddings = joblib.load('tfidf_embeddings_got.joblib')
+    return characters_df, character_names, sbert_embeddings, tfidf_embeddings
+def name_to_folder(name):
+    return name.lower().replace(" ", "_")
+def get_image_path(name):
+    normalized = name.lower().strip()
+    folder_name = name_to_folder(normalized)
+    # Try different extensions
+    for ext in ['jpg', 'jpeg', 'png', 'gif', 'bmp']:
+        candidate_path = os.path.join("images", folder_name, f"000001.{ext}")
+        if os.path.exists(candidate_path):
+            return candidate_path
+    # Fallback to placeholder
+    placeholder_path = "images/placeholder.jpg"
+    return placeholder_path if os.path.exists(placeholder_path) else None
+def recommend_characters(model_type, input_character, characters_df, sbert_embeddings, tfidf_embeddings):
+    input_character = input_character.lower().strip()
+    if input_character not in characters_df['normalized'].values:
+        return []
+    character_index = characters_df[characters_df['normalized'] == input_character].index[0]
+    embeddings = sbert_embeddings if model_type == "SBERT" else tfidf_embeddings
+    similarity_matrix = cosine_similarity(np.array(embeddings))
+    distances = similarity_matrix[character_index]
+    # Get top 5 similar characters
+    top_indices = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
+    results = []
+    for i, similarity_score in top_indices:
+        name = characters_df.iloc[i]['character']
+        image_path = get_image_path(name)
+        results.append((name.title(), image_path, similarity_score))
+    return results
+# Visualization functions
+@st.cache_data
+def compute_tsne_2d(embeddings, perplexity=30, random_state=42):
+    """Compute 2D t-SNE"""
+    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=random_state)
+    return tsne.fit_transform(embeddings)
+@st.cache_data
+def compute_tsne_3d(embeddings, perplexity=30, random_state=42):
+    """Compute 3D t-SNE"""
+    tsne = TSNE(n_components=3, perplexity=perplexity, random_state=random_state)
+    return tsne.fit_transform(embeddings)
+@st.cache_data
+def compute_pca_2d(embeddings):
+    """Compute 2D PCA"""
+    pca = PCA(n_components=2)
+    return pca.fit_transform(embeddings)
+@st.cache_data
+def compute_pca_3d(embeddings):
+    """Compute 3D PCA"""
+    pca = PCA(n_components=3)
+    return pca.fit_transform(embeddings)
+def create_2d_plot(coords, characters, title, method):
+    """Create 2D scatter plot"""
+    df_plot = pd.DataFrame({
+        'x': coords[:, 0],
+        'y': coords[:, 1],
+        'character': characters
+    })
+    fig = px.scatter(
+        df_plot,
+        x='x',
+        y='y',
+        text='character',
+        title=f"{title} - {method}",
+        hover_data={'character': True, 'x': ':.3f', 'y': ':.3f'}
+    )
+    fig.update_traces(
+        textposition="top center",
+        textfont_size=8,
+        marker=dict(size=8, opacity=0.7)
+    )
+    fig.update_layout(
+        height=600,
+        showlegend=False,
+        xaxis_title=f"{method} Component 1",
+        yaxis_title=f"{method} Component 2"
+    )
+    return fig
+def create_3d_plot(coords, characters, title, method):
+    """Create 3D scatter plot"""
+    fig = go.Figure(data=[go.Scatter3d(
+        x=coords[:, 0],
+        y=coords[:, 1],
+        z=coords[:, 2],
+        mode='markers+text',
+        text=characters,
+        textposition="top center",
+        textfont_size=8,
+        marker=dict(
+            size=6,
+            opacity=0.7,
+            color=coords[:, 0],  # Color by first component
+            colorscale='Viridis',
+            showscale=True
+        ),
+        hovertemplate='<b>%{text}</b><br>' +
+                      f'{method} 1: %{{x:.3f}}<br>' +
+                      f'{method} 2: %{{y:.3f}}<br>' +
+                      f'{method} 3: %{{z:.3f}}<br>' +
+                      '<extra></extra>'
+    )])
+    fig.update_layout(
+        title=f"{title} - {method}",
+        scene=dict(
+            xaxis_title=f"{method} Component 1",
+            yaxis_title=f"{method} Component 2",
+            zaxis_title=f"{method} Component 3"
+        ),
+        height=600
+    )
+    return fig
+# Streamlit App
+def main():
+    st.set_page_config(
+        page_title="GoT Character Similarity Explorer",
+        page_icon="⚔️",
+        layout="wide"
+    )
+    st.title("⚔️ Game of Thrones Character Similarity Explorer")
+    # Load data
+    characters_df, character_names, sbert_embeddings, tfidf_embeddings = load_data()
+    # Create tabs
+    tab1, tab2 = st.tabs(["🔍 Character Similarity", "📊 Dimensionality Reduction"])
+    with tab1:
+        st.markdown("Select a model and character to view top semantic matches!")
+        # Sidebar controls
+        with st.sidebar:
+            st.header("Settings")
+            model_type = st.radio(
+                "Select Embedding Model:",
+                ["SBERT", "TFIDF"],
+                help="Choose between SBERT (semantic) or TF-IDF (keyword-based) similarity"
+            )
+            selected_character = st.selectbox(
+                "Choose Character:",
+                character_names,
+                help="Select a character to find similar ones"
+            )
+            if st.button("Find Similar Characters", type="primary"):
+                st.session_state.search_clicked = True
+            else:
+                st.session_state.search_clicked = getattr(st.session_state, 'search_clicked', False)
+        # Main content
+        if st.session_state.search_clicked and selected_character:
+            st.subheader(f"Characters similar to **{selected_character}** (using {model_type})")
+            # Get recommendations
+            results = recommend_characters(
+                model_type, selected_character, characters_df, sbert_embeddings, tfidf_embeddings
+            )
+            if results:
+                # Display in columns
+                cols = st.columns(5)
+                for idx, (name, image_path, similarity) in enumerate(results):
+                    with cols[idx]:
+                        if image_path and os.path.exists(image_path):
+                            try:
+                                image = Image.open(image_path)
+                                st.image(image, use_container_width=True)
+                            except Exception as e:
+                                st.error(f"Could not load image: {e}")
+                        else:
+                            st.info("No image available")
+                        st.markdown(f"**{name}**")
+                        st.caption(f"Similarity: {similarity:.3f}")
+            else:
+                st.error("Character not found or no similar characters available.")
+        else:
+            # Welcome message
+            st.info("👈 Select a character from the sidebar and click 'Find Similar Characters' to get started!")
+            # Show some stats
+            col1, col2, col3 = st.columns(3)
+            with col1:
+                st.metric("Total Characters", len(character_names))
+            with col2:
+                st.metric("Embedding Models", "2")
+            with col3:
+                st.metric("Similarity Algorithm", "Cosine")
+    with tab2:
+        st.markdown("### Interactive Dimensionality Reduction Visualizations")
+        st.markdown("Explore character embeddings in 2D and 3D space using t-SNE and PCA")
+        # Controls for visualization
+        col1, col2, col3 = st.columns(3)
+        with col1:
+            viz_model = st.selectbox(
+                "Embedding Model:",
+                ["SBERT", "TFIDF"],
+                key="viz_model"
+            )
+        with col2:
+            viz_method = st.selectbox(
+                "Reduction Method:",
+                ["t-SNE", "PCA"],
+                key="viz_method"
+            )
+        with col3:
+            viz_dims = st.selectbox(
+                "Dimensions:",
+                ["2D", "3D"],
+                key="viz_dims"
+            )
+        # Additional parameters for t-SNE
+        if viz_method == "t-SNE":
+            perplexity = st.slider(
+                "Perplexity (t-SNE parameter):",
+                min_value=5,
+                max_value=50,
+                value=30,
+                help="Lower values focus on local structure, higher values on global structure"
+            )
+        # Generate visualization button
+        if st.button("Generate Visualization", type="primary"):
+            with st.spinner(f"Computing {viz_method} {viz_dims} for {viz_model} embeddings..."):
+                # Get the right embeddings
+                embeddings = np.array(sbert_embeddings) if viz_model == "SBERT" else np.array(tfidf_embeddings)
+                characters = characters_df['character'].tolist()
+                try:
+                    # Compute coordinates based on method and dimensions
+                    if viz_method == "t-SNE" and viz_dims == "2D":
+                        coords = compute_tsne_2d(embeddings, perplexity=perplexity if viz_method == "t-SNE" else 30)
+                        fig = create_2d_plot(coords, characters, f"{viz_model} Embeddings", "t-SNE")
+                    elif viz_method == "t-SNE" and viz_dims == "3D":
+                        coords = compute_tsne_3d(embeddings, perplexity=perplexity if viz_method == "t-SNE" else 30)
+                        fig = create_3d_plot(coords, characters, f"{viz_model} Embeddings", "t-SNE")
+                    elif viz_method == "PCA" and viz_dims == "2D":
+                        coords = compute_pca_2d(embeddings)
+                        fig = create_2d_plot(coords, characters, f"{viz_model} Embeddings", "PCA")
+                    elif viz_method == "PCA" and viz_dims == "3D":
+                        coords = compute_pca_3d(embeddings)
+                        fig = create_3d_plot(coords, characters, f"{viz_model} Embeddings", "PCA")
+                    # Display the plot
+                    st.plotly_chart(fig, use_container_width=True)
+                    # Show some information about the visualization
+                    st.info(f"""
+                    **Visualization Info:**
+                    - Model: {viz_model}
+                    - Method: {viz_method} {viz_dims}
+                    - Characters: {len(characters)}
+                    - Original dimensions: {embeddings.shape[1]}
+                    """ + (f"- Perplexity: {perplexity}" if viz_method == "t-SNE" else ""))
+                except Exception as e:
+                    st.error(f"Error generating visualization: {str(e)}")
+        # Information about methods
+        with st.expander("ℹ️ About Dimensionality Reduction Methods"):
+            st.markdown("""
+            **t-SNE (t-Distributed Stochastic Neighbor Embedding):**
+            - Great for visualizing clusters and local neighborhoods
+            - Non-linear method that preserves local structure
+            - Good for finding groups of similar characters
+            - Perplexity controls local vs global structure focus
+            **PCA (Principal Component Analysis):**
+            - Linear method that preserves global variance
+            - Shows the main directions of variation in the data
+            - Faster computation than t-SNE
+            - Components have interpretable meaning
+            **2D vs 3D:**
+            - 2D is easier to interpret and interact with
+            - 3D can reveal additional structure but may be harder to read
+            """)
+# Run the app only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()

characters_list_got.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:edfff5a75d926592b2f646ab7e88eece666b7ff3dcf78a599f010f88422fd0af
+size 1810

embeddings_got.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dcc7af34e18c61e74630ba2446ad1773dfd47c2054b47c56382986c3d947d305
+size 377714

tfidf_embeddings_got.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eb3e10c1b896b2a42d0fb774f6219122ceefa09669ab9b46e7c9c893d9c4c9aa
+size 9782794