import os
import re
from datetime import datetime
import pandas as pd
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
def handle_plot(save_path: str = '../data/saved_plots',
                filename: str = None,
                label: str = None,
                add_timestamp: bool = True,
                show_plot: bool = True):
    """
    Saves or displays the current plot, with an optional timestamp in the filename.

    Args:
        save_path (str): Directory to save the plot.
        filename (str): Name of the file to save the plot (e.g., 'plot.png'). If None, the plot is only displayed.
        label (str): Label to prepend to the filename (e.g., 'jobs', 'resumes'). Required, together with
            filename, for the plot to be saved; if None, the plot is only displayed.
        add_timestamp (bool): Whether to add a timestamp to the filename. Default is True.
        show_plot (bool): Whether to also display the plot after saving. Default is True.

    Returns:
        None: Displays the plot and/or saves it to the specified path.
    """
    if label and filename:
        os.makedirs(save_path, exist_ok=True)
        base, ext = os.path.splitext(filename)
        if not ext:
            ext = ".png"
        # Prepend the label
        label_prefix = f"{label}_"
        # Append a timestamp if requested
        timestamp_suffix = f"_{datetime.now().strftime('%d-%m-%Y_%H-%M-%S')}" if add_timestamp else ""
        # Final filename
        final_name = f"{label_prefix}{base}{timestamp_suffix}{ext}"
        save_full_path = os.path.join(save_path, final_name)
        plt.tight_layout()
        plt.savefig(save_full_path, bbox_inches='tight')
        print(f"✅ Plot saved to: {save_full_path}")
        # Show before closing; once the figure is closed there is nothing left to display
        if show_plot:
            plt.show()
        plt.close()
    else:
        plt.tight_layout()
        plt.show()
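# Example usage (illustrative sketch only; the figure and the 'resumes' label below
# are made-up values, not part of the project's real data):
#   plt.plot([1, 2, 3])
#   handle_plot('../data/saved_plots', filename='demo.png', label='resumes')
#   # -> saves e.g. ../data/saved_plots/resumes_demo_<timestamp>.png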
def visualize_word_frequency(df: pd.DataFrame,
                             column_to_visualize: str = 'text_cleaned',
                             number_of_words: int = 30,
                             save_path: str = '../data/saved_plots/word_frequencies',
                             plot_label: str = None):
    """
    Visualizes word frequency in a specified column of a DataFrame.

    Args:
        df (pd.DataFrame): DataFrame containing the text data.
        column_to_visualize (str): Column name to visualize. Default is 'text_cleaned'.
        number_of_words (int): Number of top words to display. Default is 30.
        save_path (str): Directory to save the bar plot.
        plot_label (str): Label to prepend to the filename when saving the plot.

    Returns:
        None: Displays a bar plot of the top specified number of words.
    """
    # Extract the text of the specified column as a list of strings
    words = df[column_to_visualize].dropna().tolist()
    # Combine all cleaned text into one string
    all_words = " ".join(words)
    # Count word frequencies
    word_freq = Counter(all_words.split())
    # Keep the top specified number of words
    top_words = {word: freq for word, freq in word_freq.most_common(number_of_words) if word.strip() != ""}
    # Bar plot
    plt.figure(figsize=(12, 6))
    plt.bar(top_words.keys(), top_words.values())
    plt.xlabel("words")
    plt.ylabel("frequency")
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.title(f"Top {number_of_words} Words in Resume Corpus")
    # Save or show the plot
    handle_plot(save_path, filename=f'word_freq_top_{number_of_words}.png', label=plot_label)
def plot_wordcloud(df: pd.DataFrame,
                   column_to_visualize: str = 'text_cleaned',
                   save_path: str = '../data/saved_plots/wordclouds',
                   plot_label: str = None):
    """
    Visualizes a word cloud from a specified column of a DataFrame.

    Args:
        df (pd.DataFrame): DataFrame containing the text data.
        column_to_visualize (str): Column name to visualize. Default is 'text_cleaned'.
        save_path (str): Directory to save the word cloud plot.
        plot_label (str): Label to prepend to the filename when saving the plot.

    Returns:
        None: Displays a word cloud of the words in the specified column.
    """
    # Extract the text of the specified column as a list of strings
    words = df[column_to_visualize].dropna().tolist()
    # Combine all cleaned text into one string
    all_words = " ".join(words)
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_words)
    plt.figure(figsize=(12, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title("WordCloud")
    # Save or show the plot
    handle_plot(save_path, filename='wordcloud.png', label=plot_label)
def plot_length_distribution(df: pd.DataFrame,
                             column_to_visualize: str = 'text_cleaned',
                             number_of_bins: int = 30,
                             save_path: str = '../data/saved_plots/length_distributions',
                             plot_label: str = None):
    """
    Visualizes the distribution of text lengths in a specified column of a DataFrame.

    Args:
        df (pd.DataFrame): DataFrame containing the text data.
        column_to_visualize (str): Column name to visualize. Default is 'text_cleaned'.
        number_of_bins (int): Number of bins for the histogram. Default is 30.
        save_path (str): Directory to save the histogram plot.
        plot_label (str): Label to prepend to the filename when saving the plot.

    Returns:
        None: Displays a histogram of text lengths.
    """
    # Compute the word count of each document (0 for non-string entries)
    text_lens = df[column_to_visualize].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)
    # Plot the length distribution
    plt.figure(figsize=(10, 4))
    plt.hist(text_lens, bins=number_of_bins, alpha=0.7)
    plt.xlabel("Number of Words")
    plt.ylabel("Frequency")
    plt.title("Document Length Distribution")
    # Save or show the plot
    handle_plot(save_path, filename='length_distribution.png', label=plot_label)
def plot_similarity_heatmap(df: pd.DataFrame,
                            column_to_visualize: str = 'text_cleaned',
                            number_of_samples: int = 100,
                            max_features: int = 100,
                            save_path: str = '../data/saved_plots/similarity_heatmaps',
                            plot_label: str = None):
    """
    Visualizes the cosine similarity heatmap of documents in a specified column of a DataFrame.

    Args:
        df (pd.DataFrame): DataFrame containing the text data.
        column_to_visualize (str): Column name to visualize. Default is 'text_cleaned'.
        number_of_samples (int): Number of samples to visualize. Default is 100.
        max_features (int): Maximum number of features for TF-IDF vectorization. Default is 100.
        save_path (str): Directory to save the heatmap plot.
        plot_label (str): Label to prepend to the filename when saving the plot.

    Returns:
        None: Displays a heatmap of cosine similarity between documents.
    """
    vec = TfidfVectorizer(max_features=max_features)
    # Take the first number_of_samples non-null texts and fit-transform them into a TF-IDF matrix
    texts = df[column_to_visualize].dropna().iloc[:number_of_samples].tolist()
    tfidf_matrix = vec.fit_transform(texts)
    # Compute and plot the pairwise cosine similarity
    similarity_matrix = cosine_similarity(tfidf_matrix)
    plt.figure(figsize=(10, 8))
    sns.heatmap(similarity_matrix, cmap='coolwarm')
    plt.title("Resume-to-Resume Similarity Heatmap")
    # Save or show the plot
    handle_plot(save_path, filename='similarity_heatmap.png', label=plot_label)
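# Note (illustrative): cosine_similarity(tfidf_matrix) yields an (n_docs, n_docs)
# symmetric matrix with 1.0 on the diagonal. For example, two identical documents
# also give 1.0 off the diagonal:
#   cosine_similarity(TfidfVectorizer().fit_transform(["data science", "data science"]))
#   # -> [[1., 1.], [1., 1.]]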
def top_words_by_category(df: pd.DataFrame,
                          text_column: str,
                          category_column: str,
                          number_of_categories: int = 5,
                          number_of_words: int = 10,
                          save_path: str = '../data/saved_plots/top_words_by_categories',
                          plot_label: str = None):
    """
    Visualizes the top words in each of the most frequent categories of a DataFrame.

    Args:
        df (pd.DataFrame): DataFrame containing the text data.
        text_column (str): Column name containing the text data.
        category_column (str): Column name containing the category data.
        number_of_categories (int): Number of most frequent categories to visualize. Default is 5.
        number_of_words (int): Number of top words to display for each category. Default is 10.
        save_path (str): Directory to save the bar plots.
        plot_label (str): Label to prepend to the filename when saving the plot.

    Returns:
        None: Displays a bar plot of the top words in each category.
    """
    # Step 1: Identify the top-N most frequent categories
    top_categories = df[category_column].value_counts().head(number_of_categories).index.tolist()
    # Step 2: Loop through these top categories only
    for cat in top_categories:
        subset = df[df[category_column] == cat]
        # Vectorize the text
        vec = CountVectorizer(stop_words='english', max_features=1000)
        X = vec.fit_transform(subset[text_column])
        # Get word frequencies
        word_freq = X.sum(axis=0).A1
        words = vec.get_feature_names_out()
        freq_dict = dict(zip(words, word_freq))
        # Select the top-N words
        top_n = dict(sorted(freq_dict.items(), key=lambda x: x[1], reverse=True)[:number_of_words])
        # Plot
        plt.figure(figsize=(10, 4))
        plt.bar(top_n.keys(), top_n.values(), color='teal')
        # Only add the label to the title if one was provided
        label_suffix = f" ({plot_label.capitalize()})" if plot_label else ""
        plt.title(f"Top {number_of_words} Words in '{cat}'{label_suffix}")
        plt.xticks(rotation=45)
        plt.tight_layout()
        # Sanitize the category name for use in a filename
        safe_cat = re.sub(r'[^\w\-_.]', '_', cat)
        # Save or show the plot
        handle_plot(save_path, filename=f"top_{number_of_words}_words_in_{safe_cat}.png", label=plot_label)
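

# Minimal usage sketch (illustrative only): the toy DataFrame, its column names,
# and the 'demo' label are assumptions for demonstration, not the project's real data.
if __name__ == "__main__":
    demo_df = pd.DataFrame({
        "text_cleaned": [
            "python machine learning pandas data analysis",
            "deep learning neural networks python tensorflow",
            "project management agile scrum stakeholder communication",
        ],
        "category": ["Data Science", "Data Science", "Management"],
    })
    visualize_word_frequency(demo_df, number_of_words=10, plot_label="demo")
    plot_wordcloud(demo_df, plot_label="demo")
    plot_length_distribution(demo_df, number_of_bins=5, plot_label="demo")
    plot_similarity_heatmap(demo_df, number_of_samples=3, plot_label="demo")
    top_words_by_category(demo_df, text_column="text_cleaned",
                          category_column="category",
                          number_of_categories=2, number_of_words=5,
                          plot_label="demo")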