import os
import re
from datetime import datetime
import pandas as pd
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
def handle_plot(save_path: str = '../data/saved_plots',
                filename: str = None,
                label: str = None,
                add_timestamp: bool = True,
                show_plot: bool = True):
    """
    Saves or displays the current plot, with an optional timestamp in the filename.

    Args:
        save_path (str): Directory to save the plot.
        filename (str): Name of the file to save the plot (e.g., 'plot.png'). If None, the plot is only displayed.
        label (str): Label to prepend to the filename (e.g., 'jobs', 'resumes'). Required, together with
            filename, for the plot to be saved; if None, the plot is only displayed.
        add_timestamp (bool): Whether to add a timestamp to the filename. Default is True.
        show_plot (bool): Whether to also display the plot after saving. Default is True.

    Returns:
        None: Displays the plot and/or saves it to the specified path.
    """
    if label and filename:
        os.makedirs(save_path, exist_ok=True)
        base, ext = os.path.splitext(filename)
        if not ext:
            ext = ".png"
        # Prepend the label
        label_prefix = f"{label}_"
        # Append a timestamp if requested
        timestamp_suffix = f"_{datetime.now().strftime('%d-%m-%Y_%H-%M-%S')}" if add_timestamp else ""
        # Final filename
        final_name = f"{label_prefix}{base}{timestamp_suffix}{ext}"
        save_full_path = os.path.join(save_path, final_name)
        plt.tight_layout()
        plt.savefig(save_full_path, bbox_inches='tight')
        print(f"✅ Plot saved to: {save_full_path}")
        # Show before closing; once the figure is closed there is nothing left to display
        if show_plot:
            plt.show()
        plt.close()
    else:
        plt.tight_layout()
        plt.show()
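# Example usage (illustrative sketch only; the figure and the 'resumes' label below
# are made-up values, not part of the project's real data):
#   plt.plot([1, 2, 3])
#   handle_plot('../data/saved_plots', filename='demo.png', label='resumes')
#   # -> saves e.g. ../data/saved_plots/resumes_demo_<timestamp>.png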
def visualize_word_frequency(df: pd.DataFrame,
                             column_to_visualize: str = 'text_cleaned',
                             number_of_words: int = 30,
                             save_path: str = '../data/saved_plots/word_frequencies',
                             plot_label: str = None):
    """
    Visualizes word frequency in a specified column of a DataFrame.

    Args:
        df (pd.DataFrame): DataFrame containing the text data.
        column_to_visualize (str): Column name to visualize. Default is 'text_cleaned'.
        number_of_words (int): Number of top words to display. Default is 30.
        save_path (str): Directory to save the bar plot.
        plot_label (str): Label to prepend to the filename when saving the plot.

    Returns:
        None: Displays a bar plot of the top specified number of words.
    """
    # Extract the text of the specified column as a list of strings
    words = df[column_to_visualize].dropna().tolist()
    # Combine all cleaned text into one string
    all_words = " ".join(words)
    # Count word frequencies
    word_freq = Counter(all_words.split())
    # Keep the top specified number of words
    top_words = {word: freq for word, freq in word_freq.most_common(number_of_words) if word.strip() != ""}
    # Bar plot
    plt.figure(figsize=(12, 6))
    plt.bar(top_words.keys(), top_words.values())
    plt.xlabel("words")
    plt.ylabel("frequency")
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.title(f"Top {number_of_words} Words in Resume Corpus")
    # Save or show the plot
    handle_plot(save_path, filename=f'word_freq_top_{number_of_words}.png', label=plot_label)
def plot_wordcloud(df: pd.DataFrame,
                   column_to_visualize: str = 'text_cleaned',
                   save_path: str = '../data/saved_plots/wordclouds',
                   plot_label: str = None):
    """
    Visualizes a word cloud from a specified column of a DataFrame.

    Args:
        df (pd.DataFrame): DataFrame containing the text data.
        column_to_visualize (str): Column name to visualize. Default is 'text_cleaned'.
        save_path (str): Directory to save the word cloud plot.
        plot_label (str): Label to prepend to the filename when saving the plot.

    Returns:
        None: Displays a word cloud of the words in the specified column.
    """
    # Extract the text of the specified column as a list of strings
    words = df[column_to_visualize].dropna().tolist()
    # Combine all cleaned text into one string
    all_words = " ".join(words)
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_words)
    plt.figure(figsize=(12, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title("WordCloud")
    # Save or show the plot
    handle_plot(save_path, filename='wordcloud.png', label=plot_label)
def plot_length_distribution(df: pd.DataFrame,
                             column_to_visualize: str = 'text_cleaned',
                             number_of_bins: int = 30,
                             save_path: str = '../data/saved_plots/length_distributions',
                             plot_label: str = None):
    """
    Visualizes the distribution of text lengths in a specified column of a DataFrame.

    Args:
        df (pd.DataFrame): DataFrame containing the text data.
        column_to_visualize (str): Column name to visualize. Default is 'text_cleaned'.
        number_of_bins (int): Number of bins for the histogram. Default is 30.
        save_path (str): Directory to save the histogram plot.
        plot_label (str): Label to prepend to the filename when saving the plot.

    Returns:
        None: Displays a histogram of text lengths.
    """
    # Compute the word count of each document (0 for non-string entries)
    text_lens = df[column_to_visualize].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)
    # Plot the length distribution
    plt.figure(figsize=(10, 4))
    plt.hist(text_lens, bins=number_of_bins, alpha=0.7)
    plt.xlabel("Number of Words")
    plt.ylabel("Frequency")
    plt.title("Document Length Distribution")
    # Save or show the plot
    handle_plot(save_path, filename='length_distribution.png', label=plot_label)
def plot_similarity_heatmap(df: pd.DataFrame,
                            column_to_visualize: str = 'text_cleaned',
                            number_of_samples: int = 100,
                            max_features: int = 100,
                            save_path: str = '../data/saved_plots/similarity_heatmaps',
                            plot_label: str = None):
    """
    Visualizes the cosine similarity heatmap of documents in a specified column of a DataFrame.

    Args:
        df (pd.DataFrame): DataFrame containing the text data.
        column_to_visualize (str): Column name to visualize. Default is 'text_cleaned'.
        number_of_samples (int): Number of samples to visualize. Default is 100.
        max_features (int): Maximum number of features for TF-IDF vectorization. Default is 100.
        save_path (str): Directory to save the heatmap plot.
        plot_label (str): Label to prepend to the filename when saving the plot.

    Returns:
        None: Displays a heatmap of cosine similarity between documents.
    """
    vec = TfidfVectorizer(max_features=max_features)
    # Take the first number_of_samples non-null texts and fit-transform them into a TF-IDF matrix
    texts = df[column_to_visualize].dropna().iloc[:number_of_samples].tolist()
    tfidf_matrix = vec.fit_transform(texts)
    # Compute and plot the pairwise cosine similarity
    similarity_matrix = cosine_similarity(tfidf_matrix)
    plt.figure(figsize=(10, 8))
    sns.heatmap(similarity_matrix, cmap='coolwarm')
    plt.title("Resume-to-Resume Similarity Heatmap")
    # Save or show the plot
    handle_plot(save_path, filename='similarity_heatmap.png', label=plot_label)
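# Note (illustrative): cosine_similarity(tfidf_matrix) yields an (n_docs, n_docs)
# symmetric matrix with 1.0 on the diagonal. For example, two identical documents
# also give 1.0 off the diagonal:
#   cosine_similarity(TfidfVectorizer().fit_transform(["data science", "data science"]))
#   # -> [[1., 1.], [1., 1.]]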
def top_words_by_category(df: pd.DataFrame,
                          text_column: str,
                          category_column: str,
                          number_of_categories: int = 5,
                          number_of_words: int = 10,
                          save_path: str = '../data/saved_plots/top_words_by_categories',
                          plot_label: str = None):
    """
    Visualizes the top words in each of the most frequent categories of a DataFrame.

    Args:
        df (pd.DataFrame): DataFrame containing the text data.
        text_column (str): Column name containing the text data.
        category_column (str): Column name containing the category data.
        number_of_categories (int): Number of most frequent categories to visualize. Default is 5.
        number_of_words (int): Number of top words to display for each category. Default is 10.
        save_path (str): Directory to save the bar plots.
        plot_label (str): Label to prepend to the filename when saving the plot.

    Returns:
        None: Displays a bar plot of the top words in each category.
    """
    # Step 1: Identify the top-N most frequent categories
    top_categories = df[category_column].value_counts().head(number_of_categories).index.tolist()
    # Step 2: Loop through these top categories only
    for cat in top_categories:
        subset = df[df[category_column] == cat]
        # Vectorize the text
        vec = CountVectorizer(stop_words='english', max_features=1000)
        X = vec.fit_transform(subset[text_column])
        # Get word frequencies
        word_freq = X.sum(axis=0).A1
        words = vec.get_feature_names_out()
        freq_dict = dict(zip(words, word_freq))
        # Select the top-N words
        top_n = dict(sorted(freq_dict.items(), key=lambda x: x[1], reverse=True)[:number_of_words])
        # Plot
        plt.figure(figsize=(10, 4))
        plt.bar(top_n.keys(), top_n.values(), color='teal')
        # Only add the label to the title if one was provided
        label_suffix = f" ({plot_label.capitalize()})" if plot_label else ""
        plt.title(f"Top {number_of_words} Words in '{cat}'{label_suffix}")
        plt.xticks(rotation=45)
        plt.tight_layout()
        # Sanitize the category name for use in a filename
        safe_cat = re.sub(r'[^\w\-_.]', '_', cat)
        # Save or show the plot
        handle_plot(save_path, filename=f"top_{number_of_words}_words_in_{safe_cat}.png", label=plot_label)
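

# Minimal usage sketch (illustrative only): the toy DataFrame, its column names,
# and the 'demo' label are assumptions for demonstration, not the project's real data.
if __name__ == "__main__":
    demo_df = pd.DataFrame({
        "text_cleaned": [
            "python machine learning pandas data analysis",
            "deep learning neural networks python tensorflow",
            "project management agile scrum stakeholder communication",
        ],
        "category": ["Data Science", "Data Science", "Management"],
    })
    visualize_word_frequency(demo_df, number_of_words=10, plot_label="demo")
    plot_wordcloud(demo_df, plot_label="demo")
    plot_length_distribution(demo_df, number_of_bins=5, plot_label="demo")
    plot_similarity_heatmap(demo_df, number_of_samples=3, plot_label="demo")
    top_words_by_category(demo_df, text_column="text_cleaned",
                          category_column="category",
                          number_of_categories=2, number_of_words=5,
                          plot_label="demo")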