Commit 0ad99b7
Parent(s): 92feaec

Add GUI and Refactor Pipelines for reusability
Changed files:
- .gitignore (+1, -0)
- gui/app.py (+202, -0)
- pipelines/app_pipeline.py (+0, -179)
- pipelines/applicant_pipeline.py (+59, -0)
- pipelines/core/applicant.py (+125, -0)
- pipelines/core/recruiter.py (+100, -0)
- pipelines/recruiter_pipeline.py (+34, -128)
- src/feature_engg/bert_embedding_data.py (+40, -20)
- src/feature_engg/tfidf_vectorizing_data.py (+13, -10)
- src/utils/file_reader.py (+3, -0)
.gitignore
CHANGED

```diff
@@ -215,4 +215,5 @@ data/processed/*.txt
 data/raw/*/*csv
 data/saved_plots/
 models/
+__pycache__/
 tests/
```
gui/app.py
ADDED
@@ -0,0 +1,202 @@

```python
import streamlit as st
import os
import tempfile
import pandas as pd
import shutil
import sys
import altair as alt
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.utils.bulk_loading import bulk_load_raw_resume_files
from src.utils.file_reader import extract_text_from_file
from pipelines.core.applicant import run_tfidf_pipeline as applicant_tfidf, run_bert_pipeline as applicant_bert
from pipelines.core.recruiter import rank_with_tfidf as recruiter_tfidf, rank_with_bert as recruiter_bert

# --- App Configuration ---
st.set_page_config(
    page_title="Resume-Job Matcher",
    page_icon="📄",
    layout="wide"
)

# --- Main App ---
st.title("🎯 AI-Powered Resume-Job Matcher")
st.write("---")

# --- Sidebar for Mode Selection ---
with st.sidebar:
    st.header("Controls")
    app_mode = st.radio(
        "Choose your view",
        ("Applicant", "Recruiter"),
        help="Select 'Applicant' to match your resume to jobs. Select 'Recruiter' to rank resumes for a job."
    )
    model_choice = st.selectbox(
        "Choose the AI Model",
        ("TF-IDF", "BERT"),
        help="TF-IDF is faster. BERT is more accurate."
    )

    st.write("---")

    # Add a checkbox to control the 'show all' feature
    show_all = st.checkbox("Show all matches", value=False)

    if show_all:
        top_k = None
        # Disable the slider when 'show_all' is checked for better UX
        st.slider(
            "Number of matches to show",
            min_value=1, max_value=50, value=5, step=1,
            disabled=True
        )
        st.info("Showing all ranked results.")
    else:
        # Enable the slider when 'show_all' is unchecked
        top_k = st.slider(
            "Number of matches to show",
            min_value=1, max_value=50, value=5, step=1,
            disabled=False
        )


# --- Applicant View ---
if app_mode == "Applicant":
    st.header("Applicant: Match Your Resume to a Job")

    resume_file = st.file_uploader(
        "Upload your resume",
        type=['pdf', 'docx', 'txt'],
        help="Please upload your resume in PDF, DOCX, or TXT format."
    )

    if resume_file:
        st.success(f"✅ Successfully uploaded `{resume_file.name}`")
        if st.button("Find Top Job Matches", type="primary", use_container_width=True):

            with st.spinner(f"Analyzing resume with {model_choice}..."):

                tmp_file_path = None
                try:
                    with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(resume_file.name)[1]) as tmp_file:
                        tmp_file.write(resume_file.getvalue())
                        tmp_file_path = tmp_file.name

                    raw_resume_text = extract_text_from_file(tmp_file_path)

                    if model_choice == "BERT":
                        matches, message = applicant_bert(raw_resume_text, top_k=top_k)
                    else:
                        matches, message = applicant_tfidf(raw_resume_text, top_k=top_k)

                    if not matches:
                        st.warning("⚠️ No suitable job matches found.")
                    else:
                        st.subheader(f"Top {len(matches)} Job Matches:")
                        st.info(message)

                        df = pd.DataFrame(matches, columns=["Job Title", "Match Score"])

                        # Sort the DataFrame by 'Match Score' in descending order to show best matches at the top
                        df = df.sort_values(by="Match Score", ascending=False).reset_index(drop=True)

                        chart = alt.Chart(df).mark_bar().encode(
                            y=alt.Y('Job Title', sort='-x', title=None),
                            x=alt.X('Match Score', axis=None, scale=alt.Scale(domainMin=0)),
                            # Tooltip to reveal score on hover
                            tooltip=['Job Title', alt.Tooltip('Match Score', format='.3f')]
                        ).properties(
                            # Set a title for the chart to indicate what the bars represent
                            title="Relative Job Match Scores"
                        ).interactive()

                        st.altair_chart(chart, use_container_width=True)

                except Exception as e:
                    st.error(f"An error occurred: {e}")

                finally:
                    if tmp_file_path and os.path.exists(tmp_file_path):
                        os.unlink(tmp_file_path)


# --- Recruiter View ---
if app_mode == "Recruiter":
    st.header("Recruiter: Rank Resumes for a Job Description")

    job_desc_file = st.file_uploader(
        "Upload the job description",
        type=['pdf', 'docx', 'txt'],
        help="Upload the job description in PDF, DOCX, or TXT format."
    )

    resume_files = st.file_uploader(
        "Upload candidate resumes",
        type=['pdf', 'docx', 'txt'],
        accept_multiple_files=True,
        help="Upload one or more resumes."
    )

    if job_desc_file and resume_files:
        st.success(f"✅ Successfully uploaded job description `{job_desc_file.name}` and {len(resume_files)} resumes.")
        if st.button("Rank Resumes", type="primary", use_container_width=True):

            with st.spinner(f"Ranking {len(resume_files)} resumes with {model_choice}..."):

                # Paths for cleanup in the finally block
                temp_dir = None
                job_desc_path = None

                try:
                    # 1. Handle the single job description file
                    with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(job_desc_file.name)[1]) as tmp_file:
                        tmp_file.write(job_desc_file.getvalue())
                        job_desc_path = tmp_file.name
                    raw_job_text = extract_text_from_file(job_desc_path)

                    # 2. Handle multiple resume files by creating a temp directory for bulk loading
                    temp_dir = tempfile.mkdtemp()
                    for resume_file in resume_files:
                        resume_path = os.path.join(temp_dir, resume_file.name)
                        with open(resume_path, "wb") as f:
                            f.write(resume_file.getbuffer())

                    # Bulk loading all resumes from the temp directory
                    raw_resume_texts = bulk_load_raw_resume_files(temp_dir)

                    # 3. Call the appropriate model's pipeline based on the model choice (default to TF-IDF)
                    if model_choice == "BERT":
                        ranked_resumes, message = recruiter_bert(raw_job_text, raw_resume_texts, top_k=top_k)
                    else:
                        ranked_resumes, message = recruiter_tfidf(raw_job_text, raw_resume_texts, top_k=top_k)

                    # 4. Display results
                    if not ranked_resumes:
                        st.warning("⚠️ Could not rank resumes. Please check the files.")
                    else:
                        st.subheader(f"Top {len(ranked_resumes)} Ranked Resumes:")
                        st.info(message)
                        df = pd.DataFrame(ranked_resumes, columns=["Resume", "Match Score"])

                        df["Match Score"] = df["Match Score"].apply(lambda x: min(1.0, x))
                        st.dataframe(
                            df,
                            column_config={
                                "Resume": st.column_config.TextColumn("Resume"),
                                "Match Score": st.column_config.ProgressColumn(
                                    "Match Score",
                                    format="%.2f",
                                    min_value=0,
                                    max_value=1,
                                ),
                            },
                            use_container_width=True,
                            hide_index=True,
                        )

                except Exception as e:
                    st.error(f"⚠️ An error occurred: {e}")

                finally:
                    # 5. Clean up all temporary files and the directory
                    if job_desc_path and os.path.exists(job_desc_path):
                        os.unlink(job_desc_path)
                    if temp_dir and os.path.exists(temp_dir):
                        shutil.rmtree(temp_dir)
```
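The GUI is a thin layer over the core pipelines, so the matching logic can be exercised without Streamlit. A minimal sketch (the resume path is hypothetical; assumes the repo root is the working directory so the imports resolve, and that the Hub artifacts are reachable):

```python
# Minimal sketch: call the core applicant pipeline directly, outside the GUI.
from src.utils.file_reader import extract_text_from_file
from pipelines.core.applicant import run_tfidf_pipeline

raw_resume = extract_text_from_file("my_resume.pdf")        # hypothetical input file
matches, message = run_tfidf_pipeline(raw_resume, top_k=5)  # (title, score) pairs + status message

print(message)
for title, score in matches:
    print(f"{title}: {score:.4f}")
```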
pipelines/app_pipeline.py
DELETED
@@ -1,179 +0,0 @@

Removed file (its CLI resurfaces as pipelines/applicant_pipeline.py on top of pipelines/core/applicant.py):

```python
import argparse
import os
import pandas as pd
from src.feature_engg.tfidf_vectorizing_data import load_tfidf_vectorizer, load_tfidf_matrix
from src.feature_engg.bert_embedding_data import get_bert_model, load_faiss_index
from src.processing.text_cleaning import clean_text, clean_text_for_bert
from src.matching.matching_engine import compute_similarity_matrix, top_n_tfidf_matches, top_n_bert_matches
from src.utils.file_reader import extract_text_from_file


def load_job_titles(job_csv_path: str):
    df = pd.read_csv(job_csv_path)
    if "title" not in df.columns:
        raise ValueError("Job CSV must contain a 'title' column.")
    return df


# ------------------------- TF-IDF PIPELINE -------------------------
def run_tfidf_pipeline(args, raw_resume: str):

    # Step 1: Clean resume
    cleaned_resume = clean_text(raw_resume)

    # Step 2: Load vectorizer + job matrix (local first, fallback HF)
    vectorizer = load_tfidf_vectorizer(
        local_vectorizer_path=args.local_vectorizer_path,
        repo_id=args.tfidf_repo_id,
        filename=args.vectorizer_filename
    )
    job_matrix = load_tfidf_matrix(
        local_matrix_path=args.local_matrix_path,
        repo_id=args.tfidf_repo_id,
        filename=args.matrix_filename
    )

    # Step 3: Vectorize resume
    resume_vector = vectorizer.transform([cleaned_resume])

    # Step 4: Compute cosine similarity
    sim_matrix = compute_similarity_matrix(resume_vector, job_matrix)

    # Step 5: Load job titles
    job_df = load_job_titles("data/app_data/tfidf_job_titles.csv")

    # Step 6: Get top-N job matches
    top_k = args.top_k

    if args.top_k > len(job_df['title'].unique()):
        print(f"⚠️ Requested top_k={args.top_k} exceeds unique job titles={len(job_df['title'].unique())}. Reducing top_k.")
        top_k = len(job_df['title'].unique())

    elif args.top_k is None:
        top_k = len(job_df['title'].unique())
        print(f"\nℹ️ Showing all {top_k} job titles.\n")

    matches = top_n_tfidf_matches(sim_matrix, top_n=top_k, job_df=job_df)

    print(f"\n🎯 Top {top_k} Job Matches for the Resume (TF-IDF):")
    for job_idx, score in matches[0]:
        print(f"🔹 {job_df.iloc[job_idx]['title']} (score: {score:0.4f})")

    if args.debug:
        print("\n================ DEBUG MODE ================")
        print("\n🔍--- [DEBUG - TFIDF] Cleaned Resume Preview:\n", cleaned_resume[:1000], "---")
        print(f"\n--- [DEBUG - TFIDF] Raw Similarity Scores (top {top_k}) ---")
        for job_idx, score in matches[0]:
            print(f"[{job_idx}] {job_df.iloc[job_idx]['title']} -> {score:0.6f}")
        print("==============================================")


# ------------------------- BERT PIPELINE -------------------------
def run_bert_pipeline(args, raw_resume: str):

    # Step 1: Load fine-tuned ST model (local or HF Hub)
    model = get_bert_model(args.local_bert_path or args.bert_repo_id)

    # Step 2: Load FAISS index (local or HF Hub)
    job_index = load_faiss_index(
        local_index_path=args.local_index_path,
        repo_id=args.bert_repo_id,
        filename=args.index_filename
    )

    # Step 3: Clean resume text for transformer
    cleaned_resume = clean_text_for_bert(raw_resume)

    # Step 4: Embed
    resume_embedding = model.encode(
        [cleaned_resume],
        normalize_embeddings=True
    )

    # Step 5: Search
    n_jobs = job_index.ntotal
    D, I = job_index.search(resume_embedding, n_jobs)

    # Step 6: Load job titles
    job_df = load_job_titles("data/app_data/bert_job_titles.csv")

    # Step 7: Rank top-N
    top_k = args.top_k

    if args.top_k > len(job_df['title'].unique()):
        print(f"⚠️ Requested top_k={args.top_k} exceeds unique job titles={len(job_df['title'].unique())}. Reducing top_k.")
        top_k = len(job_df['title'].unique())

    elif args.top_k is None:
        top_k = len(job_df['title'].unique())
        print(f"\nℹ️ Showing all {top_k} job titles.\n")

    matches = top_n_bert_matches(I, D, job_df, top_n=top_k)

    print(f"\n🎯 Top {top_k} Job Matches for the Resume (BERT):")
    for idx, score in matches:
        print(f"🔹 {job_df.iloc[idx]['title']} (score: {score:0.4f})")

    if args.debug:
        print("\n================ DEBUG MODE ================")
        print(f"\n--- [DEBUG - BERT/FAISS] Raw Similarity Scores (top {top_k}) ---")
        for idx, score in matches:
            print(f"[{idx}] {job_df.iloc[idx]['title']} -> {score:0.6f}")
        print("==============================================")


# ------------------------- MAIN -------------------------
def main(args):
    try:
        if not os.path.exists(args.resume_path):
            raise FileNotFoundError(f"⚠️ Resume file not found at: {args.resume_path}")

        raw_resume = extract_text_from_file(args.resume_path)
        print(f"\n📄 Resume: {args.resume_path}")

        # Pipeline selector
        print(f"⚙️ Using model: {args.model.upper()}")
        if args.model == "bert":
            run_bert_pipeline(args, raw_resume)
        else:
            run_tfidf_pipeline(args, raw_resume)

    except Exception as e:
        print(f"❌ Error: {str(e)}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Match a resume to top relevant job titles")

    # Shared args
    parser.add_argument("--resume_path", type=str, required=True, help="Path to resume file")
    parser.add_argument("--model", type=str, choices=["tfidf", "bert"], default="tfidf")
    parser.add_argument("--top_k", type=int, default=None,
                        help="Number of top matches to return; if not specified, returns all")
    parser.add_argument("--debug", action="store_true",
                        help="print raw similarity scores for both and cleaned resume for tfidf pipeline")

    # TF-IDF args
    parser.add_argument("--local_vectorizer_path", type=str, default=None,
                        help="Local TF-IDF vectorizer .pkl file")
    parser.add_argument("--local_matrix_path", type=str, default=None,
                        help="Local TF-IDF job matrix .npz file")
    parser.add_argument("--tfidf_repo_id", type=str, default="Om-Shandilya/resume-matcher-tfidf",
                        help="Hub repo id for HuggingFace model")
    parser.add_argument("--vectorizer_filename", type=str, default="applicant/job_vectorizer.pkl",
                        help="Filename of vectorizer in the HF repo")
    parser.add_argument("--matrix_filename", type=str, default="applicant/job_matrix.npz",
                        help="Filename of matrix in the HF repo")

    # BERT args
    parser.add_argument("--local_bert_path", type=str, default=None,
                        help="Local fine-tuned ST model path")
    parser.add_argument("--local_index_path", type=str, default=None,
                        help="Local FAISS index file path")
    parser.add_argument("--bert_repo_id", type=str, default="Om-Shandilya/resume-matcher-bert",
                        help="fine-tuned ST model's HF repo id")
    parser.add_argument("--index_filename", type=str, default="applicant/jobs.faiss",
                        help="Filename of FAISS index in the HF repo")

    args = parser.parse_args()
    main(args)
```
pipelines/applicant_pipeline.py
ADDED
@@ -0,0 +1,59 @@

```python
import argparse, os
from src.utils.file_reader import extract_text_from_file
from pipelines.core.applicant import run_tfidf_pipeline, run_bert_pipeline

def main(args):
    try:
        if not os.path.exists(args.resume_path):
            raise FileNotFoundError(f"⚠️ Resume not found at {args.resume_path}")
        raw_resume = extract_text_from_file(args.resume_path)

        if args.model == "bert":
            matches, message = run_bert_pipeline(raw_resume,
                                                 local_bert_path=args.local_bert_path,
                                                 local_index_path=args.local_index_path,
                                                 repo_id=args.bert_repo_id,
                                                 index_filename=args.index_filename,
                                                 top_k=args.top_k,
                                                 debug=args.debug)
        else:
            matches, message = run_tfidf_pipeline(raw_resume,
                                                  local_vectorizer_path=args.local_vectorizer_path,
                                                  local_matrix_path=args.local_matrix_path,
                                                  repo_id=args.tfidf_repo_id,
                                                  vectorizer_filename=args.vectorizer_filename,
                                                  matrix_filename=args.matrix_filename,
                                                  top_k=args.top_k,
                                                  debug=args.debug)

        print(f"\n{message}")
        print(f"\n🎯 Top {len(matches)} Job Matches ({args.model.upper()}):")
        for title, score in matches:
            print(f"🔹 {title} (score: {score:.4f})")

    except Exception as e:
        print(f"❌ Error: {str(e)}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Match a resume to top relevant job titles")
    parser.add_argument("--resume_path", type=str, required=True)
    parser.add_argument("--model", choices=["tfidf", "bert"], default="tfidf")
    parser.add_argument("--top_k", type=int, default=None)
    parser.add_argument("--debug", action="store_true",
                        help="print raw similarity scores for both and cleaned resume for tfidf pipeline")

    # tfidf args
    parser.add_argument("--local_vectorizer_path", type=str, default=None)
    parser.add_argument("--local_matrix_path", type=str, default=None)
    parser.add_argument("--tfidf_repo_id", type=str, default="Om-Shandilya/resume-matcher-tfidf")
    parser.add_argument("--vectorizer_filename", type=str, default="applicant/job_vectorizer.pkl")
    parser.add_argument("--matrix_filename", type=str, default="applicant/job_matrix.npz")

    # bert args
    parser.add_argument("--local_bert_path", type=str, default=None)
    parser.add_argument("--local_index_path", type=str, default=None)
    parser.add_argument("--bert_repo_id", type=str, default="Om-Shandilya/resume-matcher-bert")
    parser.add_argument("--index_filename", type=str, default="applicant/jobs.faiss")

    args = parser.parse_args()
    main(args)
```
pipelines/core/applicant.py
ADDED
@@ -0,0 +1,125 @@

```python
import pandas as pd
from pathlib import Path
from src.feature_engg.tfidf_vectorizing_data import load_tfidf_vectorizer, load_tfidf_matrix
from src.feature_engg.bert_embedding_data import load_bert_model, load_faiss_index
from src.processing.text_cleaning import clean_text, clean_text_for_bert
from src.matching.matching_engine import compute_similarity_matrix, top_n_tfidf_matches, top_n_bert_matches

# Defining paths for data files
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent

def load_job_titles(job_csv_path: str):
    df = pd.read_csv(job_csv_path)
    if "title" not in df.columns:
        raise ValueError("Job CSV must contain a 'title' column.")
    return df

def run_tfidf_pipeline(raw_resume: str,
                       local_vectorizer_path=None,
                       local_matrix_path=None,
                       repo_id="Om-Shandilya/resume-matcher-tfidf",
                       vectorizer_filename="applicant/job_vectorizer.pkl",
                       matrix_filename="applicant/job_matrix.npz",
                       top_k=None,
                       debug=False):
    """Return top-N matches using TF-IDF pipeline.

    Args:
        raw_resume (str): Raw text of the resume.
        local_vectorizer_path (str, optional): Local path to TF-IDF vectorizer.
        local_matrix_path (str, optional): Local path to TF-IDF matrix.
        repo_id (str): Hugging Face repo ID for vectorizer/matrix.
        vectorizer_filename (str): Filename of the vectorizer in the repo.
        matrix_filename (str): Filename of the matrix in the repo.
        top_k (int, optional): Number of top matches to return. If None, return all.
        debug (bool, optional): Print raw similarity scores and the cleaned resume.

    Returns:
        Tuple[List[Tuple[str, float]], str]: (job_title, score) pairs for the
        top_k matches, plus a status message.
    """
    cleaned_resume = clean_text(raw_resume)

    vectorizer = load_tfidf_vectorizer(local_vectorizer_path, repo_id, vectorizer_filename)
    job_matrix = load_tfidf_matrix(local_matrix_path, repo_id, matrix_filename)

    resume_vector = vectorizer.transform([cleaned_resume])
    sim_matrix = compute_similarity_matrix(resume_vector, job_matrix)

    job_df = load_job_titles(PROJECT_ROOT / "data/app_data/tfidf_job_titles.csv")
    total_jobs = len(job_df['title'].unique())

    message = ""
    if top_k is None:
        final_top_k = total_jobs
        message = f"✅ Showing all {total_jobs} job matches, ranked by relevance."
    elif top_k > total_jobs:
        final_top_k = total_jobs
        message = f"ℹ️ You requested {top_k} matches, but only {total_jobs} are available. Showing all {total_jobs} matches."
    else:
        final_top_k = top_k
        message = f"✅ Showing the top {final_top_k} job matches."

    matches = top_n_tfidf_matches(sim_matrix, top_n=final_top_k, job_df=job_df)

    if debug:
        print("\n================ DEBUG MODE ================")
        print("\n🔍--- [DEBUG - TFIDF] Cleaned Resume Preview:\n", cleaned_resume[:1000], "---")
        print(f"\n--- [DEBUG - TFIDF] Raw Similarity Scores (top {final_top_k}) ---")
        for job_idx, score in matches[0]:
            print(f"[{job_idx}] {job_df.iloc[job_idx]['title']} -> {score:0.6f}")
        print("==============================================")

    return [(job_df.iloc[j]['title'], score) for j, score in matches[0]], message


def run_bert_pipeline(raw_resume: str,
                      local_bert_path=None,
                      local_index_path=None,
                      repo_id="Om-Shandilya/resume-matcher-bert",
                      index_filename="applicant/jobs.faiss",
                      top_k=None,
                      debug=False):
    """Return top-N matches using BERT + FAISS pipeline.

    Args:
        raw_resume (str): Raw text of the resume.
        local_bert_path (str, optional): Local path to BERT model.
        local_index_path (str, optional): Local path to FAISS index.
        repo_id (str): Hugging Face repo ID for model/index.
        index_filename (str): Filename of the FAISS index in the repo.
        top_k (int, optional): Number of top matches to return. If None, return all.
        debug (bool, optional): Print raw similarity scores.

    Returns:
        Tuple[List[Tuple[str, float]], str]: (job_title, score) pairs for the
        top_k matches, plus a status message.
    """
    model = load_bert_model(local_bert_path=local_bert_path, repo_id=repo_id)
    job_index = load_faiss_index(local_index_path, repo_id, index_filename)

    cleaned_resume = clean_text_for_bert(raw_resume)
    resume_embedding = model.encode([cleaned_resume], normalize_embeddings=True)

    D, I = job_index.search(resume_embedding, job_index.ntotal)
    job_df = load_job_titles(PROJECT_ROOT / "data/app_data/bert_job_titles.csv")
    total_jobs = len(job_df['title'].unique())

    message = ""
    if top_k is None:
        final_top_k = total_jobs
        message = f"✅ Showing all {total_jobs} job matches, ranked by relevance."
    elif top_k > total_jobs:
        final_top_k = total_jobs
        message = f"ℹ️ You requested {top_k} matches, but only {total_jobs} are available. Showing all {total_jobs} matches."
    else:
        final_top_k = top_k
        message = f"✅ Showing the top {final_top_k} job matches."

    matches = top_n_bert_matches(I, D, job_df, top_n=final_top_k)

    if debug:
        print("\n================ DEBUG MODE ================")
        print(f"\n--- [DEBUG - BERT/FAISS] Raw Similarity Scores (top {final_top_k}) ---")
        for idx, score in matches:
            print(f"[{idx}] {job_df.iloc[idx]['title']} -> {score:0.6f}")
        print("==============================================")

    return [(job_df.iloc[i]['title'], score) for i, score in matches], message
```
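A note on the scoring: because `run_bert_pipeline` encodes with `normalize_embeddings=True`, the FAISS inner-product search returns cosine similarities directly. A self-contained sketch of that identity (the 384-dim size is an assumption, typical of MiniLM-class sentence transformers):

```python
# With unit-norm vectors, inner product == cosine similarity,
# so an IndexFlatIP search score is directly interpretable.
import numpy as np

rng = np.random.default_rng(0)
a, b = rng.random(384), rng.random(384)            # assumed embedding dimension
a, b = a / np.linalg.norm(a), b / np.linalg.norm(b)

inner = float(a @ b)
cosine = float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))
assert abs(inner - cosine) < 1e-12
```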
pipelines/core/recruiter.py
ADDED
@@ -0,0 +1,100 @@

```python
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from src.feature_engg.tfidf_vectorizing_data import load_tfidf_vectorizer
from src.feature_engg.bert_embedding_data import load_bert_model
from src.processing.text_cleaning import clean_text, clean_text_for_bert


def rank_with_tfidf(raw_job_text, raw_resume_texts, *,
                    local_vectorizer_path=None,
                    repo_id="Om-Shandilya/resume-matcher-tfidf",
                    filename="recruiter/combined_vectorizer.pkl",
                    top_k=None,
                    debug=False):
    """Rank resumes using TF-IDF similarity."""
    vectorizer = load_tfidf_vectorizer(
        local_vectorizer_path=local_vectorizer_path,
        repo_id=repo_id,
        filename=filename
    )

    cleaned_job_text = clean_text(raw_job_text)
    job_vector = vectorizer.transform([cleaned_job_text])

    cleaned_resumes = {fname: clean_text(txt) for fname, txt in raw_resume_texts.items()}
    resume_matrix = vectorizer.transform(cleaned_resumes.values())

    sims = cosine_similarity(job_vector, resume_matrix)[0]
    ranked = sorted(zip(cleaned_resumes.keys(), sims), key=lambda x: x[1], reverse=True)

    available_resumes = len(ranked)

    message = ""
    if top_k is None:
        final_top_k = available_resumes
        message = f"✅ Showing all {available_resumes} resume matches, ranked by relevance."
    elif top_k > available_resumes:
        final_top_k = available_resumes
        message = f"ℹ️ You requested {top_k} matches, but only {available_resumes} are available. Showing all {available_resumes} matches."
    else:
        final_top_k = top_k
        message = f"✅ Showing the top {final_top_k} resume matches."

    if debug:
        print("\n================ DEBUG MODE ================")
        print("\n🔍--- [DEBUG - TFIDF] Cleaned Job Description Preview:\n", cleaned_job_text[:1000], "---")
        print("\n--- [DEBUG - TFIDF] First 3 Cleaned Resumes ---")
        for i, (fname, txt) in enumerate(cleaned_resumes.items()):
            if i >= 3: break
            print(f"{fname}: {txt[:300]}...\n")
        print("\n--- [DEBUG - TFIDF] Raw Similarity Scores ---")
        for fname, score in ranked[:final_top_k]:
            print(f"{fname} -> {score:0.6f}")
        print("==============================================")

    return [(fname, score) for fname, score in ranked[:final_top_k]], message


def rank_with_bert(raw_job_text, raw_resume_texts, *,
                   local_bert_path=None,
                   repo_id="Om-Shandilya/resume-matcher-bert",
                   top_k=None,
                   debug=False):
    """Rank resumes using BERT embeddings."""
    model = load_bert_model(local_bert_path=local_bert_path, repo_id=repo_id)

    cleaned_job_text = clean_text_for_bert(raw_job_text)
    job_embedding = model.encode([cleaned_job_text], normalize_embeddings=True)

    cleaned_resumes = {fname: clean_text_for_bert(txt) for fname, txt in raw_resume_texts.items()}
    resume_embeddings = model.encode(list(cleaned_resumes.values()), normalize_embeddings=True)

    # Dot product of normalized embeddings == cosine similarity
    sims = np.dot(resume_embeddings, job_embedding.T).flatten()
    ranked = sorted(zip(cleaned_resumes.keys(), sims), key=lambda x: x[1], reverse=True)

    available_resumes = len(ranked)

    message = ""
    if top_k is None:
        final_top_k = available_resumes
        message = f"✅ Showing all {available_resumes} resume matches, ranked by relevance."
    elif top_k > available_resumes:
        final_top_k = available_resumes
        message = f"ℹ️ You requested {top_k} matches, but only {available_resumes} are available. Showing all {available_resumes} matches."
    else:
        final_top_k = top_k
        message = f"✅ Showing the top {final_top_k} resume matches."

    if debug:
        print("\n================ DEBUG MODE ================")
        print("\n🔍--- [DEBUG - BERT] Cleaned Job Description Preview:\n", cleaned_job_text[:1000], "---")
        print("\n--- [DEBUG - BERT] First 3 Cleaned Resumes ---")
        for i, (fname, txt) in enumerate(cleaned_resumes.items()):
            if i >= 3: break
            print(f"{fname}: {txt[:300]}...\n")
        print("\n--- [DEBUG - BERT] Raw Similarity Scores ---")
        for fname, score in ranked[:final_top_k]:
            print(f"{fname} -> {score:0.6f}")
        print("==============================================")

    return [(fname, score) for fname, score in ranked[:final_top_k]], message
```
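Both rankers take the raw job text plus a `{filename: text}` dict and return `(ranked_list, message)`, which is what keeps them reusable from the GUI and the CLI alike. A usage sketch with hypothetical in-memory resumes (the vectorizer is fetched from the Hub on first call):

```python
from pipelines.core.recruiter import rank_with_tfidf

job_text = "Seeking a Python engineer with NLP experience."
resumes = {                                  # hypothetical inputs
    "alice.pdf": "python developer nlp spacy scikit-learn",
    "bob.docx": "java backend engineer spring microservices",
}

ranked, message = rank_with_tfidf(job_text, resumes, top_k=2)
print(message)
for fname, score in ranked:
    print(f"{fname}: {score:.4f}")
```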
pipelines/recruiter_pipeline.py
CHANGED
@@ -1,137 +1,51 @@ and @@ -140,28 +54,20 @@

The inline TF-IDF and BERT ranking functions (near-duplicates of the applicant pipeline) were removed in favor of pipelines/core/recruiter.py; the CLI is now a thin wrapper. Resulting file:

```python
import argparse
import os
from src.utils.bulk_loading import bulk_load_raw_resume_files
from src.utils.file_reader import extract_text_from_file
from pipelines.core import recruiter


def main(args):
    try:
        if not os.path.exists(args.job_desc_path):
            raise FileNotFoundError(f"⚠️ Job description not found: {args.job_desc_path}")

        raw_job_text = extract_text_from_file(args.job_desc_path)

        if not os.path.exists(args.resume_dir):
            raise FileNotFoundError(f"⚠️ Resume directory not found: {args.resume_dir}")

        raw_resume_texts = bulk_load_raw_resume_files(args.resume_dir)

        if not raw_resume_texts:
            raise ValueError("⚠️ No valid resumes found in the given directory.")

        print(f"\n📄 Loaded Job Description: {args.job_desc_path}")
        print(f"📄 Loaded {len(raw_resume_texts)} resumes from {args.resume_dir}")
        print(f"⚙️ Using model: {args.model.upper()}")

        if args.model == "bert":
            matches, message = recruiter.rank_with_bert(raw_job_text,
                                                        raw_resume_texts,
                                                        local_bert_path=args.local_bert_path,
                                                        repo_id=args.bert_repo_id,
                                                        top_k=args.top_k,
                                                        debug=args.debug)
        else:
            matches, message = recruiter.rank_with_tfidf(raw_job_text,
                                                         raw_resume_texts,
                                                         local_vectorizer_path=args.local_vectorizer_path,
                                                         repo_id=args.tfidf_repo_id,
                                                         filename=args.vectorizer_filename,
                                                         top_k=args.top_k,
                                                         debug=args.debug)

        print(f"\n{message}")
        print(f"\n🎯 Top {len(matches)} Candidate Matches ({args.model.upper()}):")
        for i, (fname, score) in enumerate(matches, 1):
            print(f"{i})-> {fname} (score: {score:.4f})")

    except Exception as e:
        print(f"❌ Error: {str(e)}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Recruiter Pipeline: Rank resumes for a given job description")

    parser.add_argument("--job_desc_path", type=str, required=True)
    parser.add_argument("--resume_dir", type=str, required=True)
    parser.add_argument("--model", type=str, choices=["tfidf", "bert"], default="tfidf")
    parser.add_argument("--top_k", type=int, default=None)
    parser.add_argument("--debug", action="store_true")

    # TF-IDF args
    parser.add_argument("--local_vectorizer_path", type=str, default=None)
    parser.add_argument("--tfidf_repo_id", type=str, default="Om-Shandilya/resume-matcher-tfidf")
    parser.add_argument("--vectorizer_filename", type=str, default="recruiter/combined_vectorizer.pkl")

    # BERT args
    parser.add_argument("--local_bert_path", type=str, default=None)
    parser.add_argument("--bert_repo_id", type=str, default="Om-Shandilya/resume-matcher-bert")

    args = parser.parse_args()
    main(args)
```
src/feature_engg/bert_embedding_data.py
CHANGED

@@ -11,19 +11,20 @@ — get_bert_model now documents its model_name argument and logs the device it loads on:

```python
def get_bert_model(model_name: str,
                   device: str = None):
    """
    Loads a BERT-based sentence transformer model for embeddings.

    Args:
        model_name (str): Hugging Face model name or path.
        device (str, optional): "cuda", "cpu", or None (auto-detect).

    Returns:
        SentenceTransformer: Loaded model ready for encoding.
    """
    device = device or ("cuda" if torch.cuda.is_available() else "cpu")
    print(f"📂 Loading BERT model '{model_name}' on device: {device}")
    return SentenceTransformer(model_name, device=device)
```

@@ -102,25 +103,44 @@ — the previously incomplete load_faiss_index (local path only) and the partial load_bert_model are replaced with full local-then-Hub loaders:

```python
def load_faiss_index(local_index_path: str, repo_id: str, filename: str):
    """Load FAISS index, preferring local then HF Hub."""
    if local_index_path:
        if not os.path.exists(local_index_path):
            raise FileNotFoundError(f"❌ Local FAISS index not found at {local_index_path}")
        print(f"📂 Loading local FAISS index from {local_index_path}")
        return faiss.read_index(local_index_path)

    print(f"📥 Downloading FAISS index from Hugging Face Hub ({repo_id}/{filename})")
    faiss_path = hf_hub_download(repo_id=repo_id, filename=filename)
    return faiss.read_index(faiss_path)


def load_bert_model(local_bert_path: str, repo_id: str = 'Om-Shandilya/resume-matcher-bert'):
    """
    Load a SentenceTransformer BERT model:
    - If local_bert_path is provided, it must be a valid path.
    - If local_bert_path is None, download from Hugging Face Hub.
    """
    if local_bert_path is None:
        try:
            print(f"📥 Downloading BERT model from Hugging Face Hub ({repo_id})")
            return SentenceTransformer(repo_id)
        except Exception as e:
            raise RuntimeError(f"❌ Failed to download model from Hugging Face Hub ({repo_id}). Error: {e}")

    if not os.path.exists(local_bert_path):
        raise FileNotFoundError(
            f"❌ The specified local path does not exist: '{local_bert_path}'. "
            "Please provide a correct path or set it to None to download from the Hub."
        )

    try:
        print(f"📂 Loading local BERT model from {local_bert_path}")
        return SentenceTransformer(local_bert_path)
    except Exception as e:
        raise RuntimeError(f"❌ Failed to load local model from '{local_bert_path}'. Error: {e}")
```
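Usage-wise, the new loader fails fast on a bad local path instead of silently falling back to the Hub; passing `None` opts into the download. A short sketch (assumes network access to the Hub repo):

```python
from src.feature_engg.bert_embedding_data import load_bert_model

model = load_bert_model(local_bert_path=None)   # pulls Om-Shandilya/resume-matcher-bert
embedding = model.encode(["senior data scientist"], normalize_embeddings=True)
print(embedding.shape)
```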
src/feature_engg/tfidf_vectorizing_data.py
CHANGED

@@ -111,21 +111,24 @@ — both loaders now validate a given local path up front and otherwise fall through to a Hub download. Resulting code:

```python
def load_tfidf_vectorizer(local_vectorizer_path: str, repo_id: str, filename: str):
    """Load TF-IDF vectorizer, preferring local then HF Hub."""
    if local_vectorizer_path:
        if not os.path.exists(local_vectorizer_path):
            raise FileNotFoundError(f"❌ Local TF-IDF vectorizer not found at {local_vectorizer_path}")
        print(f"📂 Loading local TF-IDF vectorizer from {local_vectorizer_path}")
        return joblib.load(local_vectorizer_path)

    print(f"📥 Downloading TF-IDF vectorizer from Hugging Face Hub ({repo_id}/{filename})")
    vec_path = hf_hub_download(repo_id=repo_id, filename=filename)
    return joblib.load(vec_path)


def load_tfidf_matrix(local_matrix_path: str, repo_id: str, filename: str):
    """Load TF-IDF matrix, preferring local then HF Hub."""
    if local_matrix_path:
        if not os.path.exists(local_matrix_path):
            raise FileNotFoundError(f"❌ Local TF-IDF matrix not found at {local_matrix_path}")
        print(f"📂 Loading local TF-IDF matrix from {local_matrix_path}")
        return load_npz(local_matrix_path)

    print(f"📥 Downloading TF-IDF matrix from Hugging Face Hub ({repo_id}/{filename})")
    mat_path = hf_hub_download(repo_id=repo_id, filename=filename)
    return load_npz(mat_path)
```
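The TF-IDF loaders follow the same local-then-Hub contract as the BERT side. A sketch of the Hub path (no local files given):

```python
from src.feature_engg.tfidf_vectorizing_data import load_tfidf_vectorizer, load_tfidf_matrix

vectorizer = load_tfidf_vectorizer(None, "Om-Shandilya/resume-matcher-tfidf",
                                   "applicant/job_vectorizer.pkl")
job_matrix = load_tfidf_matrix(None, "Om-Shandilya/resume-matcher-tfidf",
                               "applicant/job_matrix.npz")
resume_vector = vectorizer.transform(["cleaned resume text"])   # hypothetical input
print(resume_vector.shape, job_matrix.shape)
```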
src/utils/file_reader.py
CHANGED

```diff
@@ -53,10 +53,13 @@ def extract_text_from_file(file_path):
 
     ext = os.path.splitext(file_path)[1].lower()
     if ext == '.pdf':
+        print(f"Extracting text from PDF {file_path}")
         return extract_text_from_pdf(file_path)
     elif ext == '.docx':
+        print(f"Extracting text from DOCX {file_path}")
         return extract_text_from_docx(file_path)
     elif ext == '.txt':
+        print(f"Extracting text from TXT {file_path}")
         return extract_text_from_txt(file_path)
     else:
         raise ValueError(f"Unsupported file type: {ext}")
```
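For completeness, the dispatcher's contract (the path is hypothetical; unsupported extensions raise `ValueError`):

```python
from src.utils.file_reader import extract_text_from_file

text = extract_text_from_file("candidate.docx")   # logs "Extracting text from DOCX ..."
print(text[:200])
```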