Om-Shandilya's picture
Add feature engg + vectorization + some minor tweaks
25d0a42
raw
history blame
1.47 kB
from visualization.visualizing_data import (
visualize_word_frequency,
plot_wordcloud,
plot_length_distribution,
plot_similarity_heatmat,
top_words_by_category)
import pandas as pd
def visualize_cleaned_data(df_resumes, df_jobs, save_plots: bool = True) -> None:
    """Run the exploratory plots for the resume dataset, then the job dataset.

    Args:
        df_resumes: Cleaned resume dataset; handed unchanged to each
            plotting helper.
        df_jobs: Cleaned job dataset; ``top_words_by_category`` is asked to
            read its ``text_cleaned`` and ``title`` columns.
        save_plots: When true, every helper receives ``plot_label="resume"``
            or ``plot_label="job"``; when false, every helper receives
            ``plot_label=None``.
            NOTE(review): presumably a ``None`` label tells the helpers not
            to write the figure to disk -- confirm against
            ``visualization.visualizing_data``.
    """
    # --- Resume dataset -------------------------------------------------
    print("⏳ Visualizing Resume Dataset...")
    label = "resume" if save_plots else None

    visualize_word_frequency(df_resumes, number_of_words=30, plot_label=label)
    plot_wordcloud(df_resumes, plot_label=label)
    plot_length_distribution(df_resumes, plot_label=label)
    plot_similarity_heatmat(df_resumes, number_of_samples=100, plot_label=label)
    print("✅ Resume Dataset Visualization Complete!")

    # --- Job dataset ----------------------------------------------------
    label = "job" if save_plots else None
    print("⏳ Visualizing Job Dataset...")

    visualize_word_frequency(df_jobs, number_of_words=30, plot_label=label)
    plot_wordcloud(df_jobs, plot_label=label)
    plot_length_distribution(df_jobs, plot_label=label)
    # Pass the computed label rather than a literal "job" so that
    # save_plots=False is honoured here exactly as in the calls above.
    top_words_by_category(
        df=df_jobs,
        text_column='text_cleaned',
        category_column='title',
        number_of_words=10,
        number_of_categories=5,
        plot_label=label,
    )
    print("✅ Job Dataset Visualization Complete!")
if __name__ == "__main__":
    # Script entry point: load (or clean) both datasets, then plot them.
    # The loaders are imported here rather than at module top, so importing
    # this module for its function does not pull them in.
    from src.data.loading_data import (
        load_or_clean_job_data,
        load_or_clean_resume_data,
    )

    resumes = load_or_clean_resume_data()
    # The job loader is asked for as many rows as there are resumes.
    resume_count = len(resumes)
    jobs = load_or_clean_job_data(sample_size=resume_count)

    visualize_cleaned_data(resumes, jobs, save_plots=True)