# resume-matcher-api/src/visualization/visualizing_data.py
# Author: Om-Shandilya
# Commit 25d0a42: Add feature engineering + vectorization + some minor tweaks
import os
import re
from datetime import datetime
import pandas as pd
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import seaborn as sns
def handle_plot(save_path: str = '../data/saved_plots',
                filename: str = None,
                label: str = None,
                add_timestamp: bool = True,
                show_plot: bool = True):
    """
    Save the current matplotlib figure to disk, or display it when it cannot be saved.

    The figure is saved only when BOTH ``label`` and ``filename`` are provided;
    otherwise it is simply displayed with ``plt.show()``.

    Args:
        save_path (str): Directory to save the plot. Created if it does not exist.
        filename (str): Name of the file to save the plot (e.g., 'plot.png').
            '.png' is appended when the name has no extension.
            If None, the plot is displayed instead of being saved.
        label (str): [Required with ``filename`` to save] Label prepended to the
            filename (e.g., 'jobs', 'resumes'). If None, the plot is displayed
            instead of being saved.
        add_timestamp (bool): Whether to append a 'dd-mm-YYYY_HH-MM-SS' timestamp
            to the filename. Default is True.
        show_plot (bool): Whether to also display the figure after it has been
            saved. Has no effect when the figure is not saved (it is then always
            displayed). Default is True.

    Returns:
        None: Displays the plot and/or saves it to the specified path.
    """
    # Without both a label and a filename there is nothing to save: just display.
    if not (label and filename):
        plt.tight_layout()
        plt.show()
        return

    base, ext = os.path.splitext(filename)
    if not ext:
        ext = ".png"

    # Add timestamp if requested
    timestamp_suffix = f"_{datetime.now().strftime('%d-%m-%Y_%H-%M-%S')}" if add_timestamp else ""

    # Final filename: <label>_<base>[_<timestamp>]<ext>
    final_name = f"{label}_{base}{timestamp_suffix}{ext}"
    save_full_path = os.path.join(save_path, final_name)

    # One makedirs on the final parent directory also creates save_path itself.
    # Skip it when the parent is the current directory ('' would raise).
    target_dir = os.path.dirname(save_full_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    plt.tight_layout()
    plt.savefig(save_full_path, bbox_inches='tight')
    print(f"✅ Plot saved to: {save_full_path}")

    # Display BEFORE closing: showing a figure that was already closed renders nothing.
    if show_plot:
        plt.show()
    plt.close()
def visualize_word_frequency(df: pd.DataFrame,
                             column_to_visualize: str = 'text_cleaned',
                             number_of_words: int = 30,
                             save_path: str = '../data/saved_plots/word_frequencies',
                             plot_label: str = None):
    """
    Visualizes word frequency in a specified column of a DataFrame.

    Words are whitespace-separated tokens. Missing values and non-string
    cells in the column are ignored.

    Args:
        df (pd.DataFrame): DataFrame containing the text data.
        column_to_visualize (str): Column name to visualize. Default is 'text_cleaned'.
        number_of_words (int): Number of most frequent words to display. Default is 30.
        save_path (str): Directory to save the bar plot.
        plot_label (str): Label to prepend to the filename when saving the plot.
            If None, the plot is displayed instead of being saved.

    Returns:
        None: Displays or saves a bar plot of the most frequent words.
    """
    # Count tokens document by document instead of building one giant joined
    # string; skip non-string cells so a stray number cannot crash the plot.
    word_freq = Counter()
    for text in df[column_to_visualize].dropna():
        if isinstance(text, str):
            word_freq.update(text.split())

    # str.split() never yields empty tokens, so no extra blank filtering is needed.
    top_words = dict(word_freq.most_common(number_of_words))

    # Bar Plot
    plt.figure(figsize=(12, 6))
    plt.bar(list(top_words.keys()), list(top_words.values()))
    plt.xlabel("words")
    plt.ylabel("frequency")
    plt.xticks(rotation=45, ha='right')
    plt.title(f"Top {number_of_words} Words in Resume Corpus")

    # Save or show the plot (handle_plot applies tight_layout once the title is set)
    handle_plot(save_path, filename=f'word_freq_top_{number_of_words}.png', label=plot_label)
def plot_wordcloud(df: pd.DataFrame,
                   column_to_visualize: str = 'text_cleaned',
                   save_path: str = '../data/saved_plots/wordclouds',
                   plot_label: str = None,):
    """
    Visualizes a word cloud from a specified column of a DataFrame.

    Missing values and non-string cells in the column are ignored.

    Args:
        df (pd.DataFrame): DataFrame containing the text data.
        column_to_visualize (str): Column name to visualize. Default is 'text_cleaned'.
        save_path (str): Directory to save the word cloud image.
        plot_label (str): Label to prepend to the filename when saving the plot.
            If None, the plot is displayed instead of being saved.

    Returns:
        None: Displays or saves a word cloud of the words in the specified column.

    Raises:
        ValueError: If the column contains no text to build a word cloud from.
    """
    # Keep only real strings so " ".join cannot fail on stray non-text cells
    texts = [text for text in df[column_to_visualize].dropna() if isinstance(text, str)]

    # Combine all cleaned text into one
    all_words = " ".join(texts)
    if not all_words.strip():
        # Fail early with a message that names the offending column
        raise ValueError(f"Column '{column_to_visualize}' contains no text to build a word cloud from.")

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_words)

    plt.figure(figsize=(12, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title("WordCloud")

    # Save or show the plot
    handle_plot(save_path, 'wordcloud.png', label=plot_label)
def plot_length_distribution(df: pd.DataFrame,
                             column_to_visualize: str = 'text_cleaned',
                             number_of_bins: int = 30,
                             save_path: str = '../data/saved_plots/length_distributions',
                             plot_label: str = None):
    """
    Plots a histogram of document lengths (in words) for one text column.

    Args:
        df (pd.DataFrame): DataFrame containing the text data.
        column_to_visualize (str): Column whose texts are measured. Default is 'text_cleaned'.
        number_of_bins (int): Number of histogram bins. Default is 30.
        save_path (str): Directory to save the histogram.
        plot_label (str): Label to prepend to the filename when saving the plot.

    Returns:
        None: Displays or saves a histogram of text lengths.
    """
    def _count_words(value):
        # Anything that is not a string (e.g. a missing value) counts as zero words.
        if not isinstance(value, str):
            return 0
        return len(value.split())

    # Word count per row, computed on the fly (the DataFrame itself is not modified)
    lengths_in_words = df[column_to_visualize].apply(_count_words)

    # Histogram of the word counts
    plt.figure(figsize=(10, 4))
    plt.hist(lengths_in_words, bins=number_of_bins, alpha=0.7)
    plt.title("Document Length Distribution")
    plt.xlabel("Number of Words")
    plt.ylabel("Frequency")

    # Hand over to the shared save/show helper
    handle_plot(save_path, 'length_distribution.png', label=plot_label)
def plot_similarity_heatmat(df: pd.DataFrame,
                            column_to_visualize: str = 'text_cleaned',
                            number_of_samples: int = 100,
                            max_features: int = 100,
                            save_path: str = '../data/saved_plots/similarity_heatmaps',
                            plot_label: str = None):
    """
    Visualizes the cosine similarity heatmap of documents in a specified column of a DataFrame.

    The first ``number_of_samples`` string entries of the column are converted
    to TF-IDF vectors and compared pairwise. Missing values and non-string
    cells are ignored.

    Args:
        df (pd.DataFrame): DataFrame containing the text data.
        column_to_visualize (str): Column name to visualize. Default is 'text_cleaned'.
        number_of_samples (int): Number of samples to visualize. Default is 100.
        max_features (int): Maximum number of features for TF-IDF vectorization. Default is 100.
        save_path (str): Directory to save the heatmap plot.
        plot_label (str): Label to prepend to the filename when saving the plot.
            If None, the plot is displayed instead of being saved.

    Returns:
        None: Displays or saves a heatmap of cosine similarity between documents.

    Raises:
        ValueError: If the column contains no text documents to compare.
    """
    # Keep only real strings: the vectorizer cannot process non-text cells
    texts = [text for text in df[column_to_visualize].dropna() if isinstance(text, str)]
    texts = texts[:number_of_samples]
    if not texts:
        raise ValueError(f"Column '{column_to_visualize}' contains no text documents to compare.")

    # Fit and transform the documents into a TF-IDF matrix
    vec = TfidfVectorizer(max_features=max_features)
    tfidf_matrix = vec.fit_transform(texts)

    # Pairwise cosine similarity between all selected documents
    similarity_matrix = cosine_similarity(tfidf_matrix)

    # Start a fresh figure (like the other plotting helpers) so the heatmap is
    # never drawn on top of a figure that is still open.
    plt.figure()
    sns.heatmap(similarity_matrix, cmap='coolwarm')
    plt.title("Resume-to-Resume Similarity Heatmap")

    # Save or show the plot
    handle_plot(save_path, 'similarity_heatmap.png', label=plot_label)
def top_words_by_category(df: pd.DataFrame,
                          text_column: str,
                          category_column: str,
                          number_of_categories: int = 5,
                          number_of_words: int = 10,
                          save_path: str = '../data/saved_plots/top_words_by_categories',
                          plot_label: str = None):
    """
    Visualizes the top words in each of the most frequent categories of a DataFrame.

    One bar plot is produced per category. English stop words are excluded
    from the counts. Missing values and non-string cells in the text column
    are ignored; a category left with no text is skipped.

    Args:
        df (pd.DataFrame): DataFrame containing the text data.
        text_column (str): Column name containing the text data.
        category_column (str): Column name containing the category data.
        number_of_categories (int): Number of most frequent categories to visualize. Default is 5.
        number_of_words (int): Number of top words to display for each category. Default is 10.
        save_path (str): Directory to save the bar plots.
        plot_label (str): Label to prepend to the filename when saving the plot; it is
            also shown (capitalized) in the plot title. If None, the plots are
            displayed instead of being saved and the title carries no label.

    Returns:
        None: Displays or saves a bar plot of the top words in each category.
    """
    # plot_label is optional: only mention it in the title when it is provided
    # (calling .capitalize() on the default None would raise AttributeError).
    label_suffix = f" ({plot_label.capitalize()})" if plot_label else ""

    # Step 1: Identify top-N most frequent categories
    top_categories = df[category_column].value_counts().head(number_of_categories).index.tolist()

    # Step 2: Loop through these top categories only
    for cat in top_categories:
        subset = df[df[category_column] == cat]

        # The vectorizer only accepts strings, so drop NaN / non-text cells
        texts = [text for text in subset[text_column].dropna() if isinstance(text, str)]
        if not texts:
            print(f"⚠️ Skipping category '{cat}': no text available.")
            continue

        # Vectorize the text (a fresh vocabulary is learned for every category)
        vec = CountVectorizer(stop_words='english', max_features=1000)
        X = vec.fit_transform(texts)

        # Get word frequencies: column sums of the document-term matrix
        word_freq = X.sum(axis=0).A1
        words = vec.get_feature_names_out()
        freq_dict = dict(zip(words, word_freq))

        # Select top-N words
        top_n = dict(sorted(freq_dict.items(), key=lambda x: x[1], reverse=True)[:number_of_words])

        # Plot
        plt.figure(figsize=(10, 4))
        plt.bar(list(top_n.keys()), list(top_n.values()), color='teal')
        plt.title(f"Top {number_of_words} Words in '{cat}'{label_suffix}")
        plt.xticks(rotation=45)
        plt.tight_layout()

        # Category values may not be strings (e.g. numeric codes): convert
        # before sanitizing them for use inside a filename.
        safe_cat = re.sub(r'[^\w\-_.]', '_', str(cat))

        # Save or show the plot
        handle_plot(save_path, filename=f"top_{number_of_words}_words_in_{safe_cat}.png", label=plot_label)