# resume-matcher-app / src /visualization /visualizing_data.py
# Om-Shandilya's picture
# Add feature engg + vectorization + some minor tweaks
# 25d0a42
# raw
# history blame
# 10.3 kB
import os
import re
from datetime import datetime
import pandas as pd
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import seaborn as sns
def handle_plot(save_path: str = '../data/saved_plots',
                filename: str = None,
                label: str = None,
                add_timestamp: bool = True,
                show_plot: bool = True):
    """
    Save and/or display the current matplotlib figure.

    The figure is written to disk only when both `label` and `filename` are
    provided; otherwise it is simply displayed.

    Args:
        save_path (str): Directory to save the plot.
        filename (str): Name of the file to save the plot (e.g., 'plot.png').
            A missing extension defaults to '.png'. If None, the plot is displayed only.
        label (str): [Required with `filename` to save] Label prepended to the
            filename (e.g., 'jobs', 'resumes'). If None, the plot is displayed only.
        add_timestamp (bool): Whether to append a 'dd-mm-YYYY_HH-MM-SS' timestamp
            to the filename. Default is True.
        show_plot (bool): Whether to also display the plot after saving it.
            Ignored when nothing is saved (the plot is always displayed then).
            Default is True.

    Returns:
        None: Displays the plot and/or saves it to the specified path.
    """
    # Nothing to save without both a label and a filename: display only.
    if not (label and filename):
        plt.tight_layout()
        plt.show()
        return

    base, ext = os.path.splitext(filename)
    if not ext:
        ext = ".png"

    timestamp_suffix = f"_{datetime.now().strftime('%d-%m-%Y_%H-%M-%S')}" if add_timestamp else ""
    final_name = f"{label}_{base}{timestamp_suffix}{ext}"
    save_full_path = os.path.join(save_path, final_name)

    # The parent of the final path always contains `save_path`, so one call
    # creates everything. Skip it when the target is the current directory,
    # because os.makedirs('') raises.
    target_dir = os.path.dirname(save_full_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    plt.tight_layout()
    plt.savefig(save_full_path, bbox_inches='tight')
    print(f"✅ Plot saved to: {save_full_path}")

    # Show BEFORE closing: once the figure is closed there is nothing left to display.
    if show_plot:
        plt.show()
    plt.close()
def visualize_word_frequency(df: pd.DataFrame,
                             column_to_visualize: str = 'text_cleaned',
                             number_of_words: int = 30,
                             save_path: str = '../data/saved_plots/word_frequencies',
                             plot_label: str = None):
    """
    Visualizes word frequency in a specified column of a DataFrame.

    Args:
        df (pd.DataFrame): DataFrame containing the text data.
        column_to_visualize (str): Column name to visualize. Default is 'text_cleaned'.
        number_of_words (int): Number of top words to display. Default is 30.
        save_path (str): Directory to save the bar plot.
        plot_label (str): Label to prepend to the filename when saving the plot.
            If None, the plot is displayed instead of saved.

    Returns:
        None: Displays (or saves) a bar plot of the most frequent words.
    """
    # Missing rows are dropped; remaining non-string cells are coerced so that
    # a stray numeric value cannot break tokenization.
    texts = df[column_to_visualize].dropna().astype(str)

    # Count whitespace-separated tokens document by document. This gives the same
    # counts and tie order as joining everything first, without the giant string.
    word_freq = Counter()
    for text in texts:
        word_freq.update(text.split())

    # str.split() never yields empty tokens, so no extra filtering is needed
    top_words = dict(word_freq.most_common(number_of_words))

    # Bar Plot
    plt.figure(figsize=(12, 6))
    plt.bar(list(top_words.keys()), list(top_words.values()))
    plt.xlabel("words")
    plt.ylabel("frequency")
    plt.xticks(rotation=45, ha='right')
    plt.title(f"Top {number_of_words} Words in Resume Corpus")
    # Lay out after the title is set so the title is taken into account
    plt.tight_layout()

    # Save or show the plot
    handle_plot(save_path, filename=f'word_freq_top_{number_of_words}.png', label=plot_label)
def plot_wordcloud(df: pd.DataFrame,
                   column_to_visualize: str = 'text_cleaned',
                   save_path: str = '../data/saved_plots/wordclouds',
                   plot_label: str = None,):
    """
    Visualizes a word cloud from a specified column of a DataFrame.

    Args:
        df (pd.DataFrame): DataFrame containing the text data.
        column_to_visualize (str): Column name to visualize. Default is 'text_cleaned'.
        save_path (str): Directory to save the word cloud plot.
        plot_label (str): Label to prepend to the filename when saving the plot.
            If None, the plot is displayed instead of saved.

    Returns:
        None: Displays (or saves) a word cloud of the words in the specified column.
    """
    # Missing rows are dropped; remaining non-string cells are coerced to str
    # because str.join() raises TypeError on anything else.
    texts = df[column_to_visualize].dropna().astype(str)

    # Combine all text into one corpus string, which is what WordCloud.generate() expects
    corpus = " ".join(texts)
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(corpus)

    plt.figure(figsize=(12, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title("WordCloud")

    # Save or show the plot
    handle_plot(save_path, 'wordcloud.png', label=plot_label)
def plot_length_distribution(df: pd.DataFrame,
                             column_to_visualize: str ='text_cleaned',
                             number_of_bins: int = 30,
                             save_path: str = '../data/saved_plots/length_distributions',
                             plot_label: str = None):
    """
    Plots a histogram of per-document word counts for one text column.

    Args:
        df (pd.DataFrame): DataFrame holding the documents.
        column_to_visualize (str): Name of the text column. Default is 'text_cleaned'.
        number_of_bins (int): How many histogram bins to use. Default is 30.
        save_path (str): Directory the histogram is saved to.
        plot_label (str): Label prepended to the saved filename.

    Returns:
        None: The histogram is handed to handle_plot for saving or display.
    """
    def _word_count(value):
        # Anything that is not a string (e.g. NaN) counts as an empty document
        if not isinstance(value, str):
            return 0
        return len(value.split())

    # Number of whitespace-separated tokens in every row of the column
    lengths = df[column_to_visualize].apply(_word_count)

    # Histogram of the document lengths
    plt.figure(figsize=(10, 4))
    plt.hist(lengths, bins=number_of_bins, alpha=0.7)
    plt.title("Document Length Distribution")
    plt.xlabel("Number of Words")
    plt.ylabel("Frequency")

    # Save or show the plot
    handle_plot(save_path, 'length_distribution.png', label=plot_label)
def plot_similarity_heatmat(df: pd.DataFrame,
                            column_to_visualize: str = 'text_cleaned',
                            number_of_samples: int = 100,
                            max_features: int = 100,
                            save_path: str = '../data/saved_plots/similarity_heatmaps',
                            plot_label: str = None):
    """
    Visualizes the cosine similarity heatmap of documents in a specified column of a DataFrame.

    Args:
        df (pd.DataFrame): DataFrame containing the text data.
        column_to_visualize (str): Column name to visualize. Default is 'text_cleaned'.
        number_of_samples (int): Number of (leading, non-null) documents to compare. Default is 100.
        max_features (int): Maximum number of features for TF-IDF vectorization. Default is 100.
        save_path (str): Directory to save the heatmap plot.
        plot_label (str): Label to prepend to the filename when saving the plot.
            If None, the plot is displayed instead of saved.

    Returns:
        None: Displays (or saves) a heatmap of pairwise cosine similarity between documents.
    """
    # Take the first N non-null documents. Non-string cells are coerced to str
    # because the vectorizer only accepts text.
    texts = df[column_to_visualize].dropna().iloc[:number_of_samples].astype(str).tolist()

    # Fit and transform the documents into a TF-IDF matrix
    vec = TfidfVectorizer(max_features=max_features)
    tfidf_matrix = vec.fit_transform(texts)

    # Pairwise cosine similarity between every pair of documents
    similarity_matrix = cosine_similarity(tfidf_matrix)

    # Start a fresh figure: sns.heatmap draws on the *current* axes, so without
    # this the heatmap could land on top of a previously created plot.
    plt.figure()
    sns.heatmap(similarity_matrix, cmap='coolwarm')
    plt.title("Resume-to-Resume Similarity Heatmap")

    # Save or show the plot
    handle_plot(save_path, 'similarity_heatmap.png', label=plot_label)


# Correctly spelled alias; the original (misspelled) name is kept for existing callers.
plot_similarity_heatmap = plot_similarity_heatmat
def top_words_by_category(df: pd.DataFrame,
                          text_column: str,
                          category_column: str,
                          number_of_categories: int = 5,
                          number_of_words: int = 10,
                          save_path: str = '../data/saved_plots/top_words_by_categories',
                          plot_label: str = None):
    """
    Visualizes the top words in each of the most frequent categories of a DataFrame.

    One bar plot is produced per category.

    Args:
        df (pd.DataFrame): DataFrame containing the text data.
        text_column (str): Column name containing the text data.
        category_column (str): Column name containing the category data.
        number_of_categories (int): Number of most frequent categories to visualize. Default is 5.
        number_of_words (int): Number of top words to display for each category. Default is 10.
        save_path (str): Directory to save the bar plots.
        plot_label (str): Label to prepend to the filename when saving the plots.
            If None, the plots are displayed instead of saved.

    Returns:
        None: Displays (or saves) a bar plot of the top words for each category.
    """
    # Characters that are not safe in a filename are replaced with '_'
    unsafe_chars = re.compile(r'[^\w\-_.]')

    # `plot_label` is optional, so only mention it in the title when it is set
    # (calling .capitalize() on None would raise AttributeError).
    label_suffix = f" ({plot_label.capitalize()})" if plot_label else ""

    # Step 1: Identify top-N most frequent categories
    top_categories = df[category_column].value_counts().head(number_of_categories).index.tolist()

    # Step 2: Loop through these top categories only
    for cat in top_categories:
        # Drop missing texts and coerce the rest to str: the vectorizer rejects NaN
        texts = df.loc[df[category_column] == cat, text_column].dropna().astype(str)

        # Vectorize the text
        vec = CountVectorizer(stop_words='english', max_features=1000)
        try:
            X = vec.fit_transform(texts)
        except ValueError as err:
            # Raised when no usable vocabulary is left (no documents, or only
            # stop words). Skip this category rather than aborting the rest.
            print(f"⚠️ Skipping category '{cat}': {err}")
            continue

        # Get word frequencies (column sums of the document-term matrix)
        word_freq = X.sum(axis=0).A1
        words = vec.get_feature_names_out()
        freq_dict = dict(zip(words, word_freq))

        # Select top-N words
        top_n = dict(sorted(freq_dict.items(), key=lambda x: x[1], reverse=True)[:number_of_words])

        # Plot
        plt.figure(figsize=(10, 4))
        plt.bar(list(top_n.keys()), list(top_n.values()), color='teal')
        plt.title(f"Top {number_of_words} Words in '{cat}'{label_suffix}")
        plt.xticks(rotation=45)
        plt.tight_layout()

        # Categories are not guaranteed to be strings (e.g. numeric codes)
        safe_cat = unsafe_chars.sub('_', str(cat))

        # Save or show the plot
        handle_plot(save_path, filename=f"top_{number_of_words}_words_in_{safe_cat}.png", label=plot_label)