resume-matcher-app / src /data /loading_data.py
Om-Shandilya's picture
Add feature engg + vectorization + some minor tweaks
25d0a42
raw
history blame
3.61 kB
import pandas as pd
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.processing.text_cleaning import clean_column
def load_resume_data(path="../data/raw/resumes/Resume.csv"):
"""
Loads desired resume data from a CSV file.
Args:
path (str): Path to the CSV file containing resume data.
Returns:
pd.DataFrame: A DataFrame with meaningful resume data.
"""
# 1. Load the CSV file
df = pd.read_csv(path, encoding='utf-8')
# 2. Keep only two columns, drop rows with missing values
df = df[['Category', 'Resume_str']].dropna()
# 3. Rename columns to 'role' and 'text'
df.columns = ['role', 'text']
print(f"✅ Loaded {len(df)} resumes from {path}")
return df
def load_job_data(path="../data/raw/job_descriptions/job_Descriptions.csv",
sample_size=None,
resume_count=None):
"""
Loads desired data and samples it for job description data from a CSV file.
Args:
path (str): Path to the CSV file containing job description data.
sample_size (int): Number of job descriptions to sample.
resume_count (int): If specified, sample this many job descriptions.
Returns:
pd.DataFrame: A DataFrame with meaningful and sampled job description data.
"""
# 1. Load large CSV with all job descriptions
df = pd.read_csv(path, encoding='utf-8', low_memory=False)
# 2. Keep only two columns, drop rows with missing values
df = df[['Job Title', 'Job Description']].dropna()
# 3. Rename columns to standard names
df.columns = ['title', 'text']
# 4. If sample_size is None, use resume_count is not None set it as sample_size
if sample_size is None and resume_count is not None:
sample_size = resume_count
# 5. Randomly sample job descriptions if sample_size is specified
if sample_size is not None:
df = df.sample(n=sample_size, random_state=42)
print(f"✅ Loaded {len(df)} job descriptions from {path}")
return df
def load_or_clean_resume_data(cleaned_path="../data/processed/resumes_cleaned.csv",
raw_path="../data/raw/resumes/Resume.csv"):
if os.path.exists(cleaned_path):
print('📂 Loading cached resume data…')
df = pd.read_csv(cleaned_path)
print(f"✅ Loaded cleaned resume data from {cleaned_path}")
else:
print('📂 Loading raw resume data and 🧼 cleaning it…')
df = load_resume_data(raw_path)
df = clean_column(df, column_name='text', new_column_name='text_cleaned')
df.to_csv(cleaned_path, index=False)
print(f"🧼 Cleaned and saved resume data to {cleaned_path}")
return df
def load_or_clean_job_data(cleaned_path="../data/processed/jobs_cleaned.csv",
raw_path="../data/raw/job_descriptions/job_descriptions.csv", sample_size=None):
if os.path.exists(cleaned_path):
print('📂 Loading cached job data…')
df = pd.read_csv(cleaned_path)
print(f"✅ Loaded cleaned job description data from {cleaned_path}")
else:
print('📂 Loading raw job data and 🧼 cleaning it…')
df = load_job_data(raw_path, sample_size=sample_size)
df = clean_column(df, column_name='text', new_column_name='text_cleaned')
df.to_csv(cleaned_path, index=False)
print(f"🧼 Cleaned and saved job description data to {cleaned_path}")
return df