import streamlit as st
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
# Page configuration comes before any other Streamlit command in this script.
st.set_page_config(layout="wide")

# Custom CSS hook, injected right after the page configuration. The markup
# is currently only a newline, so nothing is styled yet.
_custom_css = "\n"
st.markdown(_custom_css, unsafe_allow_html=True)
# Cache data loading
@st.cache_data
def load_data():
    """Read the song dataset from ``song_dataset.csv``.

    The path is relative to the process working directory. The result is
    memoised by ``st.cache_data``.

    Returns:
        The CSV contents as a DataFrame, parsed with ``pd.read_csv`` defaults.
    """
    return pd.read_csv('song_dataset.csv')
# Cache matrix computations
@st.cache_data
def compute_matrices(df_songsDB):
    """Build the user/song play-count matrix and its truncated SVD.

    Args:
        df_songsDB: DataFrame with at least ``user``, ``song`` and
            ``play_count`` columns.

    Returns:
        A 3-tuple ``(user_item_matrix, svd_matrix, item_factors)``:
        the pivoted user-by-song table (missing pairs filled with 0), the
        output of ``TruncatedSVD.fit_transform`` on that table, and the
        fitted ``components_`` array.
    """
    interactions = df_songsDB.pivot_table(
        values='play_count',
        index='user',
        columns='song',
        fill_value=0,
    )
    # 20 components with a fixed seed so repeated runs give the same factors.
    reducer = TruncatedSVD(n_components=20, random_state=20)
    user_factors = reducer.fit_transform(interactions)
    return interactions, user_factors, reducer.components_
# Module-level state shared by the helper functions below: the raw dataset
# plus the collaborative-filtering matrices derived from it.
df_songsDB = load_data()
_cf_matrices = compute_matrices(df_songsDB)
user_item_matrix, svd_matrix, item_factors = _cf_matrices
# Cache the TF-IDF computation
@st.cache_data
def compute_tfidf(df_songsDB):
    """Fit a TF-IDF model on each row's artist, release and title text.

    The combined text is built as a local Series instead of being written
    back into ``df_songsDB``. Mutating the argument inside a cached function
    is unreliable, because the mutation only happens on a cache miss.

    Args:
        df_songsDB: DataFrame with ``artist_name``, ``release`` and ``title``
            columns. Missing values are treated as empty strings so that a
            single NaN does not turn the whole combined text into NaN, which
            ``TfidfVectorizer`` rejects.

    Returns:
        A 2-tuple ``(tfidf, tfidf_matrix)``: the fitted ``TfidfVectorizer``
        and the document-term matrix with one row per row of ``df_songsDB``.
    """
    # fillna before astype(str) so missing values become '' rather than 'nan'.
    text_columns = (
        df_songsDB[['artist_name', 'release', 'title']].fillna('').astype(str)
    )
    combined_features = (
        text_columns['artist_name'] + " " +
        text_columns['release'] + " " +
        text_columns['title']
    )
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(combined_features)
    return tfidf, tfidf_matrix
# Helper functions
def content_score_calculator(selected_songs, unlistened_songs):
    """Score candidate songs by text similarity to the user's selected songs.

    Each song is described by ``artist_name + " " + release + " " + title``.
    The score of a candidate is the mean TF-IDF cosine similarity between its
    text and the text of every distinct selected song.

    Fixes over the previous implementation:

    * Scores are aligned with song ids. Previously the candidate feature
      rows were taken in DataFrame row order (one row per matching record,
      so a song could appear several times) and then zipped against the
      iteration order of ``unlistened_songs``, which attached similarities
      to the wrong ids.
    * The global ``df_songsDB`` is no longer mutated on every call.
    * Missing metadata, an empty selection and an empty candidate list are
      handled instead of failing inside scikit-learn.
    * Each distinct selected feature string counts once in the average,
      rather than once per matching row of the dataset.

    Args:
        selected_songs: Iterable of song *titles* the user picked.
        unlistened_songs: Iterable of candidate song *ids* (values of the
            ``song`` column).

    Returns:
        Dict mapping every id in ``unlistened_songs`` to its mean similarity.
        All scores are ``0.0`` when no row matches ``selected_songs``.
    """
    # Freeze the iteration order once so features and ids stay aligned even
    # when the caller passes a set.
    unlistened_ids = list(unlistened_songs)
    if not unlistened_ids:
        return {}

    # fillna before astype(str) so missing values become '' rather than 'nan'.
    text_columns = (
        df_songsDB[['artist_name', 'release', 'title']].fillna('').astype(str)
    )
    combined_features = (
        text_columns['artist_name'] + " " +
        text_columns['release'] + " " +
        text_columns['title']
    )

    selected_song_features = combined_features[
        df_songsDB['title'].isin(selected_songs)
    ].drop_duplicates()
    if selected_song_features.empty:
        # Nothing to compare against: contribute a neutral content score.
        return dict.fromkeys(unlistened_ids, 0.0)

    # Exactly one feature string per song id, looked up in candidate order.
    # Ids unknown to the dataset get an empty document (similarity 0).
    features_by_song = combined_features.groupby(
        df_songsDB['song'], sort=False
    ).first()
    unlistened_song_features = (
        features_by_song.reindex(unlistened_ids).fillna('')
    )

    # The vocabulary/IDF weights are still fitted on every dataset row, as
    # before, so the similarity values themselves are computed the same way.
    tfidf = TfidfVectorizer()
    tfidf.fit(combined_features)
    selected_matrix = tfidf.transform(selected_song_features)
    unlistened_matrix = tfidf.transform(unlistened_song_features)

    similarity_scores = cosine_similarity(selected_matrix, unlistened_matrix)
    avg_similarity = similarity_scores.mean(axis=0)
    return dict(zip(unlistened_ids, avg_similarity))
def collaborative_score_calculator(user_id, unlistened_songs):
    """Score candidate songs for a user from the SVD latent factors.

    Args:
        user_id: Label of the user in ``user_item_matrix.index``.
        unlistened_songs: Iterable of candidate song ids.

    Returns:
        Dict mapping each candidate id to the dot product of the user's row
        in ``svd_matrix`` with the song's column in ``item_factors``. Ids
        that are not a column of ``user_item_matrix`` map to ``0``.

    Raises:
        KeyError: If ``user_id`` is not present in ``user_item_matrix.index``.
    """
    known_songs = user_item_matrix.columns
    user_latent = svd_matrix[user_item_matrix.index.get_loc(user_id)]

    def _score(song_id):
        # Songs without a column in the interaction matrix have no factors.
        if song_id not in known_songs:
            return 0
        column = known_songs.get_loc(song_id)
        return np.dot(user_latent, item_factors[:, column])

    return {song_id: _score(song_id) for song_id in unlistened_songs}
def hybridRecommendationEngine(user_id, selected_songs, alpha=0.5, top_n=10):
    """Recommend songs by blending collaborative and content-based scores.

    Candidates are all songs in ``df_songsDB`` that have no row for
    ``user_id``. Each candidate's score is
    ``alpha * collaborative + (1 - alpha) * content``. The scores are min-max
    normalised and the ``top_n`` best candidates are returned.

    Changes over the previous implementation:

    * ``alpha`` and ``top_n`` are parameters. Their defaults reproduce the
      former hard-coded values.
    * Candidates are kept in dataset order in a list rather than a set, so
      ties are broken the same way on every run.
    * Song metadata is de-duplicated per song id, so a song with several
      metadata rows cannot appear more than once or push the result past
      ``top_n`` entries.
    * A user with no unlistened songs yields an empty list.

    Args:
        user_id: Value of the ``user`` column identifying the user.
        selected_songs: Titles the user selected as liked.
        alpha: Weight of the collaborative score. The content score gets
            ``1 - alpha``.
        top_n: Maximum number of recommendations to return.

    Returns:
        List of strings of the form ``"<title> by <artist_name>"``, best
        first.
    """
    listened_songs = set(
        df_songsDB.loc[df_songsDB['user'] == user_id, 'song']
    )
    # Keep first-appearance order from the dataset for deterministic ties.
    unlistened_songs = [
        song_id
        for song_id in df_songsDB['song'].unique()
        if song_id not in listened_songs
    ]
    if not unlistened_songs:
        return []

    cf_scores = collaborative_score_calculator(user_id, unlistened_songs)
    content_scores = content_score_calculator(selected_songs, unlistened_songs)

    final_scores = {
        song_id: alpha * cf_scores.get(song_id, 0)
        + (1 - alpha) * content_scores.get(song_id, 0)
        for song_id in unlistened_songs
    }

    # Min-max normalisation to [0, 1]. When every score is equal the range
    # is zero, so all candidates get the neutral value 0.5.
    min_score = min(final_scores.values())
    max_score = max(final_scores.values())
    if max_score > min_score:
        score_range = max_score - min_score
        normalized_scores = {
            song_id: (score - min_score) / score_range
            for song_id, score in final_scores.items()
        }
    else:
        normalized_scores = dict.fromkeys(final_scores, 0.5)

    # sorted() is stable, so equal scores keep candidate (dataset) order.
    ranked = sorted(
        normalized_scores.items(), key=lambda item: item[1], reverse=True
    )
    recommended_song_ids = [song_id for song_id, _ in ranked[:top_n]]

    # One metadata row per song id so the left merge cannot fan out.
    song_metadata = (
        df_songsDB[['song', 'title', 'release', 'artist_name']]
        .drop_duplicates(subset='song')
    )
    recommended_songs = (
        pd.DataFrame(recommended_song_ids, columns=['song'])
        .merge(song_metadata, on='song', how='left')
        .assign(recommendation=lambda x: x['title'] + ' by ' + x['artist_name'])
    )
    return recommended_songs['recommendation'].tolist()
# Streamlit app
st.title("Delta Melody Match 🎶")

# Two columns: a narrow one for the user picker, a wider one for the songs.
col1, col2 = st.columns([2, 4])

with col1:
    with st.container():
        user_id = st.selectbox(
            "👤 Select User ID",
            options=df_songsDB['user'].unique().tolist(),
            key="small_select"
        )
        # NOTE(review): empty markup; looks like a leftover wrapper element.
        # Its original nesting level could not be recovered, so confirm it.
        st.markdown('', unsafe_allow_html=True)

# Only titles the chosen user has rows for can be selected as "liked".
songs_selectable = df_songsDB[df_songsDB['user'] == user_id]['title'].unique()

with col2:
    song_titles = st.multiselect(
        "🎵 Select Songs You Like",
        options=songs_selectable,
        default=songs_selectable[:1]
    )

# Recommendations are rendered below the columns at full page width.
if st.button("Get Recommendations"):
    st.subheader("Recommended Songs")
    recommendations = hybridRecommendationEngine(user_id, song_titles)
    for i, rec in enumerate(recommendations, 1):
        # Split on the LAST ' by ' only. A plain split(' by ') raised
        # ValueError on unpacking whenever the title itself contained
        # ' by ' (e.g. "Stand by Me by Ben E. King").
        title, _, artist = rec.rpartition(' by ')
        st.write(f"{i}. ***{title}*** by {artist}")