|
|
import streamlit as st |
|
|
import joblib |
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
import os |
|
|
from sklearn.metrics.pairwise import cosine_similarity |
|
|
from sklearn.manifold import TSNE |
|
|
from sklearn.decomposition import PCA |
|
|
from PIL import Image |
|
|
import plotly.express as px |
|
|
import plotly.graph_objects as go |
|
|
import io |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@st.cache_resource
def load_data():
    """Load the character list and both embedding sets, then precompute similarities.

    Reads ``characters_list_got.joblib``, ``embeddings_got.joblib`` and
    ``tfidf_embeddings_got.joblib`` relative to the current working directory.
    The function is wrapped in ``st.cache_resource``.

    Returns:
        A 6-tuple ``(characters_df, character_names, sbert_embeddings,
        tfidf_embeddings, sbert_sim, tfidf_sim)``:

        * ``characters_df`` has a ``character`` column plus a ``normalized``
          column (lower-cased, whitespace-stripped names).
        * ``character_names`` is the sorted list of the raw names.
        * ``sbert_sim`` / ``tfidf_sim`` are the pairwise cosine-similarity
          matrices of the corresponding embeddings.
    """
    raw_names = joblib.load('characters_list_got.joblib')
    characters_df = pd.DataFrame(raw_names, columns=['character'])
    characters_df['normalized'] = characters_df['character'].str.lower().str.strip()
    character_names = sorted(characters_df['character'].tolist())

    sbert_embeddings = joblib.load('embeddings_got.joblib')
    tfidf_embeddings = joblib.load('tfidf_embeddings_got.joblib')

    # NOTE(review): np.array() presumes both embedding sets are dense
    # array-likes; confirm the TF-IDF file is not a scipy sparse matrix.
    sbert_sim, tfidf_sim = (
        cosine_similarity(np.array(vectors))
        for vectors in (sbert_embeddings, tfidf_embeddings)
    )

    return (
        characters_df,
        character_names,
        sbert_embeddings,
        tfidf_embeddings,
        sbert_sim,
        tfidf_sim,
    )
|
|
|
|
|
def name_to_folder(name):
    """Return ``name`` lower-cased with every space replaced by an underscore."""
    lowered = name.lower()
    # Splitting on an explicit " " keeps empty pieces, so consecutive spaces
    # still become consecutive underscores.
    return "_".join(lowered.split(" "))
|
|
|
|
|
def get_image_path(name):
    """Locate the first image stored for a character.

    Looks for ``images/<folder>/000001.<ext>`` where ``<folder>`` is derived
    from the lower-cased, stripped name via ``name_to_folder`` and ``<ext>`` is
    tried in the order jpg, jpeg, png, gif, bmp.

    Returns:
        The first existing candidate path; otherwise ``images/placeholder.jpg``
        if that file exists; otherwise ``None``.
    """
    folder_name = name_to_folder(name.lower().strip())

    candidates = (
        os.path.join("images", folder_name, f"000001.{ext}")
        for ext in ('jpg', 'jpeg', 'png', 'gif', 'bmp')
    )
    found = next((path for path in candidates if os.path.exists(path)), None)
    if found is not None:
        return found

    placeholder_path = "images/placeholder.jpg"
    if os.path.exists(placeholder_path):
        return placeholder_path
    return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def recommend_characters(model_type, input_character, characters_df, sbert_sim, tfidf_sim, top_n=5, weight=0.7):
    """Return the characters most similar to ``input_character``.

    Args:
        model_type: ``"Hybrid"`` blends both matrices, ``"SBERT"`` uses
            ``sbert_sim``; any other value falls back to ``tfidf_sim``.
        input_character: Name to look up. It is lower-cased and stripped
            before being compared with ``characters_df['normalized']``.
        characters_df: DataFrame with ``character`` and ``normalized`` columns.
            Its row order is assumed to match the row/column order of the
            similarity matrices.
        sbert_sim: Square pairwise similarity matrix for the SBERT embeddings.
        tfidf_sim: Square pairwise similarity matrix for the TF-IDF embeddings.
        top_n: Maximum number of matches to return; values <= 0 yield ``[]``.
        weight: Share given to ``sbert_sim`` in ``"Hybrid"`` mode; the TF-IDF
            share is ``1 - weight``.

    Returns:
        A list of ``(title-cased name, image path or None, similarity)`` tuples
        ordered by descending similarity, or ``[]`` when the character is
        unknown. The queried character itself is never part of the result.
    """
    wanted = input_character.lower().strip()

    hits = np.flatnonzero((characters_df['normalized'] == wanted).to_numpy())
    if hits.size == 0:
        return []

    # Use the row *position* (not the index label): the similarity matrices
    # and the .iloc lookup below are both positional.
    character_index = int(hits[0])

    # Only the queried character's row is needed, so blend that single row
    # instead of materialising a full N x N hybrid matrix on every call.
    if model_type == "Hybrid":
        scores = (
            weight * np.asarray(sbert_sim[character_index])
            + (1 - weight) * np.asarray(tfidf_sim[character_index])
        )
    elif model_type == "SBERT":
        scores = np.asarray(sbert_sim[character_index])
    else:
        scores = np.asarray(tfidf_sim[character_index])

    # Drop the queried character by position. Skipping "the first sorted
    # entry" is wrong whenever another character ties with (or, through
    # rounding, beats) the self-similarity score.
    candidates = [i for i in range(len(scores)) if i != character_index]
    # sorted() is stable, so ties keep their original row order.
    ranked = sorted(candidates, key=lambda i: scores[i], reverse=True)

    results = []
    for i in ranked[:max(top_n, 0)]:
        name = characters_df.iloc[i]['character']
        results.append((name.title(), get_image_path(name), scores[i]))

    return results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@st.cache_data
def compute_tsne_2d(embeddings, perplexity=30, random_state=42):
    """Fit a 2-component t-SNE on ``embeddings`` and return the transformed points.

    ``perplexity`` and ``random_state`` are forwarded to ``TSNE`` unchanged.
    The function is wrapped in ``st.cache_data``.
    """
    reducer = TSNE(
        n_components=2,
        perplexity=perplexity,
        random_state=random_state,
    )
    projected = reducer.fit_transform(embeddings)
    return projected
|
|
|
|
|
@st.cache_data
def compute_tsne_3d(embeddings, perplexity=30, random_state=42):
    """Fit a 3-component t-SNE on ``embeddings`` and return the transformed points.

    ``perplexity`` and ``random_state`` are forwarded to ``TSNE`` unchanged.
    The function is wrapped in ``st.cache_data``.
    """
    reducer = TSNE(
        n_components=3,
        perplexity=perplexity,
        random_state=random_state,
    )
    projected = reducer.fit_transform(embeddings)
    return projected
|
|
|
|
|
@st.cache_data
def compute_pca_2d(embeddings):
    """Fit a 2-component PCA on ``embeddings`` and return the transformed points.

    The function is wrapped in ``st.cache_data``.
    """
    reducer = PCA(n_components=2)
    projected = reducer.fit_transform(embeddings)
    return projected
|
|
|
|
|
@st.cache_data
def compute_pca_3d(embeddings):
    """Fit a 3-component PCA on ``embeddings`` and return the transformed points.

    The function is wrapped in ``st.cache_data``.
    """
    reducer = PCA(n_components=3)
    projected = reducer.fit_transform(embeddings)
    return projected
|
|
|
|
|
def create_2d_plot(coords, characters, title, method):
    """Build a labelled 2D Plotly scatter of the reduced embeddings.

    Args:
        coords: 2D array-like indexed as ``coords[:, 0]`` / ``coords[:, 1]``.
        characters: Labels drawn next to (and shown when hovering) each point.
        title: Leading part of the figure title.
        method: Name of the reduction method; used in the title and axis labels.

    Returns:
        The Plotly figure produced by ``px.scatter``.
    """
    points = pd.DataFrame({
        'x': coords[:, 0],
        'y': coords[:, 1],
        'character': characters,
    })
    hover_spec = {'character': True, 'x': ':.3f', 'y': ':.3f'}

    fig = px.scatter(
        points,
        x='x',
        y='y',
        text='character',
        title=f"{title} - {method}",
        hover_data=hover_spec,
    )

    marker_style = {'size': 8, 'opacity': 0.7}
    fig.update_traces(textposition="top center", textfont_size=8, marker=marker_style)

    x_label = f"{method} Component 1"
    y_label = f"{method} Component 2"
    fig.update_layout(
        height=600,
        showlegend=False,
        xaxis_title=x_label,
        yaxis_title=y_label,
    )

    return fig
|
|
|
|
|
def create_3d_plot(coords, characters, title, method):
    """Build a labelled 3D Plotly scatter of the reduced embeddings.

    Args:
        coords: 2D array-like with at least three columns (x, y, z).
        characters: Labels drawn next to (and shown when hovering) each point.
        title: Leading part of the figure title.
        method: Name of the reduction method; used in the title, the axis
            labels and the hover text.

    Returns:
        A ``go.Figure`` holding a single ``Scatter3d`` trace whose markers are
        coloured by the first coordinate.
    """
    xs, ys, zs = coords[:, 0], coords[:, 1], coords[:, 2]

    hover_parts = (
        '<b>%{text}</b><br>',
        f'{method} 1: %{{x:.3f}}<br>',
        f'{method} 2: %{{y:.3f}}<br>',
        f'{method} 3: %{{z:.3f}}<br>',
        '<extra></extra>',
    )
    marker_style = dict(
        size=6,
        opacity=0.7,
        color=xs,
        colorscale='Viridis',
        showscale=True,
    )

    trace = go.Scatter3d(
        x=xs,
        y=ys,
        z=zs,
        mode='markers+text',
        text=characters,
        textposition="top center",
        textfont_size=8,
        marker=marker_style,
        hovertemplate="".join(hover_parts),
    )
    fig = go.Figure(data=[trace])

    scene_axes = dict(
        xaxis_title=f"{method} Component 1",
        yaxis_title=f"{method} Component 2",
        zaxis_title=f"{method} Component 3",
    )
    fig.update_layout(title=f"{title} - {method}", scene=scene_axes, height=600)

    return fig
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Emoji used in UI labels. They are written as escapes so a lossy re-encoding
# of this file cannot turn them into mojibake again.
# NOTE(review): the glyphs were reconstructed from mis-decoded text; confirm
# they match the intended icons.
_ICON_SWORDS = "\u2694\ufe0f"      # crossed swords
_ICON_SEARCH = "\U0001F50D"        # magnifying glass
_ICON_CHART = "\U0001F4CA"         # bar chart
_ICON_DOWNLOAD = "\U0001F4E5"      # inbox tray
_ICON_POINT_LEFT = "\U0001F448"    # hand pointing left (towards the sidebar)
_ICON_INFO = "\u2139\ufe0f"        # information sign

_REDUCTION_HELP = """\
**t-SNE (t-Distributed Stochastic Neighbor Embedding):**
- Great for visualizing clusters and local neighborhoods
- Non-linear method that preserves local structure
- Good for finding groups of similar characters
- Perplexity controls local vs global structure focus

**PCA (Principal Component Analysis):**
- Linear method that preserves global variance
- Shows the main directions of variation in the data
- Faster computation than t-SNE
- Components have interpretable meaning

**2D vs 3D:**
- 2D is easier to interpret and interact with
- 3D can reveal additional structure but may be harder to read
"""


def main():
    """Streamlit entry point: configure the page, load data and render both tabs."""
    st.set_page_config(
        page_title="GoT Character Similarity Explorer",
        page_icon=_ICON_SWORDS,
        layout="wide",
    )
    st.title(f"{_ICON_SWORDS} Game of Thrones Character Similarity Explorer")

    (characters_df, character_names, sbert_embeddings, tfidf_embeddings,
     sbert_sim, tfidf_sim) = load_data()

    tab1, tab2 = st.tabs([
        f"{_ICON_SEARCH} Character Similarity",
        f"{_ICON_CHART} Dimensionality Reduction",
    ])

    with tab1:
        _render_similarity_tab(characters_df, character_names, sbert_sim, tfidf_sim)

    with tab2:
        _render_reduction_tab(characters_df, sbert_embeddings, tfidf_embeddings)


def _render_similarity_tab(characters_df, character_names, sbert_sim, tfidf_sim):
    """Render the sidebar controls and the ranked matches of the similarity tab."""
    st.markdown("Select a model and character to view top semantic matches!")

    with st.sidebar:
        st.header("Settings")
        model_type = st.radio("Select Embedding Model:", ["SBERT", "TFIDF", "Hybrid"])
        selected_character = st.selectbox("Choose Character:", character_names)
        top_n = st.slider("How many similar characters?", 1, 20, 5)

        weight = 0.7
        if model_type == "Hybrid":
            weight = st.slider("Weight for SBERT (TF-IDF = 1 - weight)", 0.0, 1.0, 0.7, 0.1)

        clicked = st.button("Find Similar Characters", type="primary", key="search_button")
        # The selectbox yields None when there are no characters to choose
        # from; storing that would crash the lookup below.
        if clicked and selected_character is not None:
            # Persist the query so the results survive reruns triggered by
            # other widgets (e.g. the CSV download button).
            st.session_state.selected_character = selected_character
            st.session_state.model_type = model_type
            st.session_state.top_n = top_n
            st.session_state.weight = weight

    result_placeholder = st.empty()

    if "selected_character" not in st.session_state:
        # NOTE(review): the landing statistics are assumed to belong to this
        # "nothing searched yet" branch; the source's indentation was lost.
        _render_landing(character_names)
        return

    results = recommend_characters(
        st.session_state.model_type,
        st.session_state.selected_character,
        characters_df,
        sbert_sim,
        tfidf_sim,
        top_n=st.session_state.top_n,
        weight=st.session_state.weight,
    )

    with result_placeholder.container():
        st.subheader(
            f"Characters similar to **{st.session_state.selected_character}** "
            f"(using {st.session_state.model_type})"
        )
        if results:
            _render_matches(results)
        else:
            st.error("No similar characters found.")


def _render_matches(results):
    """Show each match as an image card (max five per row) plus a CSV download."""
    cols = st.columns(min(5, len(results)))
    rows = []

    for idx, (name, image_path, similarity) in enumerate(results):
        rows.append({"Character": name, "Similarity": similarity})
        with cols[idx % len(cols)]:
            if image_path and os.path.exists(image_path):
                try:
                    st.image(image_path, use_container_width=True, caption=name)
                except Exception:
                    # Best effort: an unreadable image must not break the page.
                    st.info("No image available")
            else:
                st.info("No image available")
            st.caption(f"Similarity: {similarity:.3f}")

    csv = pd.DataFrame(rows).to_csv(index=False).encode("utf-8")
    st.download_button(
        f"{_ICON_DOWNLOAD} Download Results as CSV",
        csv,
        "similar_characters.csv",
        "text/csv",
    )


def _render_landing(character_names):
    """Show the getting-started hint and three summary statistics."""
    st.info(
        f"{_ICON_POINT_LEFT} Select a character from the sidebar and click "
        "'Find Similar Characters' to get started!"
    )

    col1, col2, col3 = st.columns(3)

    with col1:
        st.markdown("<h3 style='text-align:center;'>Total Characters</h3>", unsafe_allow_html=True)
        st.markdown(f"<h1 style='text-align:center;'>{len(character_names)}</h1>", unsafe_allow_html=True)

    with col2:
        st.markdown("<h3 style='text-align:center;'>Embedding Models</h3>", unsafe_allow_html=True)
        st.markdown("<h2 style='text-align:center;'>SBERT, TF-IDF, Hybrid</h2>", unsafe_allow_html=True)

    with col3:
        st.markdown("<h3 style='text-align:center;'>Similarity Algorithm</h3>", unsafe_allow_html=True)
        st.markdown("<h1 style='text-align:center;'>Cosine</h1>", unsafe_allow_html=True)


def _render_reduction_tab(characters_df, sbert_embeddings, tfidf_embeddings):
    """Render the controls, the chart and the help text of the reduction tab."""
    st.markdown("### Interactive Dimensionality Reduction Visualizations")
    st.markdown("Explore character embeddings in 2D and 3D space using t-SNE and PCA")

    col1, col2, col3 = st.columns(3)
    with col1:
        viz_model = st.selectbox("Embedding Model:", ["SBERT", "TFIDF"], key="viz_model")
    with col2:
        viz_method = st.selectbox("Reduction Method:", ["t-SNE", "PCA"], key="viz_method")
    with col3:
        viz_dims = st.selectbox("Dimensions:", ["2D", "3D"], key="viz_dims")

    perplexity = 30
    if viz_method == "t-SNE":
        perplexity = st.slider(
            "Perplexity (t-SNE parameter):",
            min_value=5,
            max_value=50,
            value=30,
            help="Lower values focus on local structure, higher values on global structure",
        )

    if st.button("Generate Visualization", type="primary", key="viz_button"):
        # st.button is True only for the rerun caused by the click. Remember
        # the request so the chart is still drawn on later reruns, e.g. the
        # one triggered by pressing the HTML download button.
        st.session_state.viz_request = {
            "model": viz_model,
            "method": viz_method,
            "dims": viz_dims,
            "perplexity": perplexity,
        }

    if "viz_request" in st.session_state:
        _render_visualization(
            st.session_state.viz_request,
            characters_df,
            sbert_embeddings,
            tfidf_embeddings,
        )

    # NOTE(review): the help expander is assumed to sit at tab level (always
    # visible); the source's indentation was lost.
    with st.expander(f"{_ICON_INFO} About Dimensionality Reduction Methods"):
        st.markdown(_REDUCTION_HELP)


def _render_visualization(request, characters_df, sbert_embeddings, tfidf_embeddings):
    """Compute the requested projection and show the chart, download and summary."""
    model = request["model"]
    method = request["method"]
    dims = request["dims"]
    perplexity = request["perplexity"]

    with st.spinner(f"Computing {method} {dims} for {model} embeddings..."):
        embeddings = np.array(sbert_embeddings if model == "SBERT" else tfidf_embeddings)
        characters = characters_df['character'].tolist()

        try:
            if method == "t-SNE":
                reduce = compute_tsne_2d if dims == "2D" else compute_tsne_3d
                coords = reduce(embeddings, perplexity=perplexity)
            else:
                reduce = compute_pca_2d if dims == "2D" else compute_pca_3d
                coords = reduce(embeddings)

            build_plot = create_2d_plot if dims == "2D" else create_3d_plot
            fig = build_plot(coords, characters, f"{model} Embeddings", method)

            st.plotly_chart(fig, use_container_width=True)

            try:
                fig_html = fig.to_html()
                st.download_button(
                    f"{_ICON_DOWNLOAD} Download Plot as HTML",
                    fig_html,
                    file_name=f"{model}_{method}_{dims}.html",
                    mime="text/html",
                )
            except Exception as e:
                st.error(f"Error in saving HTML File because of {e} error")

            info_lines = [
                "**Visualization Info:**",
                f"- Model: {model}",
                f"- Method: {method} {dims}",
                f"- Characters: {len(characters)}",
                f"- Original dimensions: {embeddings.shape[1]}",
            ]
            if method == "t-SNE":
                info_lines.append(f"- Perplexity: {perplexity}")
            st.info("\n".join(info_lines))

        except Exception as e:
            # UI boundary: report the failure (e.g. a perplexity that is too
            # large for the number of characters) instead of crashing the app.
            st.error(f"Error generating visualization: {e}")
|
|
|
|
|
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|
|