import os import re from datetime import datetime import pandas as pd from collections import Counter from wordcloud import WordCloud import matplotlib.pyplot as plt from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity from sklearn.feature_extraction.text import CountVectorizer import seaborn as sns def handle_plot(save_path: str = '../data/saved_plots', filename: str = None, label: str = None, add_timestamp: bool = True, show_plot: bool = True): """ Handles saving or displaying a plot based on provided parameters, with optional timestamp in filename. Args: save_path (str): Directory to save the plot. filename (str): Name of the file to save the plot (e.g., 'plot.png'). If None, the plot will be displayed. label (str): [Required with {filename} to save] Label to prepend to the filename (e.g., 'jobs', 'resumes'). If None, the plot will be displayed instead. add_timestamp (bool): Whether to add a timestamp to the filename. Default is True. Returns: None: Displays the plot or saves it to the specified path. """ if label and filename: os.makedirs(save_path, exist_ok=True) base, ext = os.path.splitext(filename) if not ext: ext = ".png" # Add label if given label_prefix = f"{label}_" if label else "" # Add timestamp if requested timestamp_suffix = f"_{datetime.now().strftime('%d-%m-%Y_%H-%M-%S')}" if add_timestamp else "" # Final filename final_name = f"{label_prefix}{base}{timestamp_suffix}{ext}" save_full_path = os.path.join(save_path, final_name) os.makedirs(os.path.dirname(save_full_path), exist_ok=True) plt.tight_layout() plt.savefig(save_full_path, bbox_inches='tight') plt.close() print(f"✅ Plot saved to: {save_full_path}") if show_plot: plt.show() else: plt.tight_layout() plt.show() def visualize_word_frequency(df: pd.DataFrame, column_to_visualize: str ='text_cleaned', number_of_words: int = 30, save_path: str = '../data/saved_plots/word_frequencies', plot_label: str = None): """ Visualizes word frequency in a specified column of a DataFrame. Args: df (pd.DataFrame): DataFrame containing the text data. column_to_visualize (str): Column name to visualize. Default is 'text_cleaned'. top_words (int): Number of top words to display. Default is 30. save_path (str): Directory to save the heatmap plot. plot_label (str): Label to prepend to the filename when saving the plot. Returns: None: Displays a bar plot of the top specified number of words. """ # extract words from the specified column as str words = df[column_to_visualize].dropna().tolist() # Combine all cleaned text into one all_words = " ".join(words) # Get frequency word_freq = Counter(all_words.split()) # Top specified number of words top_words = {word: freq for word, freq in word_freq.most_common(number_of_words) if word.strip() != ""} # Bar Plot plt.figure(figsize=(12, 6)) plt.bar(top_words.keys(), top_words.values()) plt.xlabel("words") plt.ylabel("frequency") plt.xticks(rotation=45, ha='right') plt.tight_layout() plt.title(f"Top {number_of_words} Words in Resume Corpus") # Save or show the plot handle_plot(save_path, filename = f'word_freq_top_{number_of_words}.png', label=plot_label) def plot_wordcloud(df: pd.DataFrame, column_to_visualize: str ='text_cleaned', save_path: str = '../data/saved_plots/wordclouds', plot_label: str = None,): """ Visualizes a word cloud from a specified column of a DataFrame. Args: df (pd.DataFrame): DataFrame containing the text data. column_to_visualize (str): Column name to visualize. Default is 'text_cleaned'. save_path (str): Directory to save the heatmap plot. plot_label (str): Label to prepend to the filename when saving the plot. Returns: None: Displays a word cloud of the words in the specified column. """ # extract words from the specified column as str words = df[column_to_visualize].dropna().tolist() # Combine all cleaned text into one all_words = " ".join(words) wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_words) plt.figure(figsize=(12, 6)) plt.imshow(wordcloud, interpolation='bilinear') plt.axis('off') plt.title("WordCloud") # Save or show the plot handle_plot(save_path, f'wordcloud.png', label=plot_label) def plot_length_distribution(df: pd.DataFrame, column_to_visualize: str ='text_cleaned', number_of_bins: int = 30, save_path: str = '../data/saved_plots/length_distributions', plot_label: str = None): """ Visualizes the distribution of text lengths in a specified column of a DataFrame. Args: df (pd.DataFrame): DataFrame containing the text data. column_to_visualize (str): Column name to visualize. Default is 'text_cleaned'. number_of_bins (int): Number of bins for the histogram. Default is 30. save_path (str): Directory to save the heatmap plot. plot_label (str): Label to prepend to the filename when saving the plot. Returns: None: Displays a histogram of text lengths. """ # Create a new column with the length of each text text_lens = df[column_to_visualize].apply(lambda x: len(x.split()) if isinstance(x, str) else 0) # Plot length distributions plt.figure(figsize=(10, 4)) plt.hist(text_lens, bins=number_of_bins, alpha=0.7) plt.xlabel("Number of Words") plt.ylabel("Frequency") plt.title("Document Length Distribution") # Save or show the plot handle_plot(save_path, f'length_distribution.png', label=plot_label) def plot_similarity_heatmat(df: pd.DataFrame, column_to_visualize: str ='text_cleaned', number_of_samples: int = 100, max_features: int = 100, save_path: str = '../data/saved_plots/similarity_heatmaps', plot_label: str = None): """ Visualizes the cosine similarity heatmap of documents in a specified column of a DataFrame. Args: df (pd.DataFrame): DataFrame containing the text data. column_to_visualize (str): Column name to visualize. Default is 'text_cleaned'. number_of_samples (int): Number of samples to visualize. Default is 100. max_features (int): Maximum number of features for TF-IDF vectorization. Default is 100. save_path (str): Directory to save the heatmap plot. plot_label (str): Label to prepend to the filename when saving the plot. Returns: None: Displays a heatmap of cosine similarity between documents. """ vec = TfidfVectorizer(max_features=max_features) # Extract words from the specified column and fit and trasform as TF-IDF matrix (or a vector) texts = df[column_to_visualize].dropna().iloc[:number_of_samples].tolist() tfidf_matrix = vec.fit_transform(texts) # Calculate and plot cosine similarity similarity_matrix = cosine_similarity(tfidf_matrix) sns.heatmap(similarity_matrix, cmap='coolwarm') plt.title("Resume-to-Resume Similarity Heatmap") # Save or show the plot handle_plot(save_path, f'similarity_heatmap.png', label=plot_label) def top_words_by_category(df: pd.DataFrame, text_column: str, category_column: str, number_of_categories: int = 5, number_of_words: int = 10, save_path: str = '../data/saved_plots/top_words_by_categories', plot_label: str = None): """ Visualizes the top words in each of the most frequent categories of a DataFrame. Args: df (pd.DataFrame): DataFrame containing the text data. text_column (str): Column name containing the text data. category_column (str): Column name containing the category data. number_of_categories (int): Number of most frequent categories to visualize. Default is 10. number_of_words (int): Number of top words to display for each category. Default is 10. save_path (str): Directory to save the heatmap plot. plot_label (str): Label to prepend to the filename when saving the plot. Returns: None: Displays a bar plot of the top words in each category. """ # Step 1: Identify top-N most frequent categories top_categories = df[category_column].value_counts().head(number_of_categories).index.tolist() # Step 2: Loop through these top categories only for cat in top_categories: subset = df[df[category_column] == cat] # Vectorize the text vec = CountVectorizer(stop_words='english', max_features=1000) X = vec.fit_transform(subset[text_column]) # Get word frequencies word_freq = X.sum(axis=0).A1 words = vec.get_feature_names_out() freq_dict = dict(zip(words, word_freq)) # Select top-N words top_n = dict(sorted(freq_dict.items(), key=lambda x: x[1], reverse=True)[:number_of_words]) # Plot plt.figure(figsize=(10, 4)) plt.bar(top_n.keys(), top_n.values(), color='teal') plt.title(f"Top {number_of_words} Words in '{cat}' ({plot_label.capitalize()})") plt.xticks(rotation=45) plt.tight_layout() safe_cat = re.sub(r'[^\w\-_.]', '_', cat) # Save or show the plot handle_plot(save_path, filename = f"top_{number_of_words}_words_in_{safe_cat}.png", label=plot_label)