Spaces:
Running
Running
Commit
·
042558f
1
Parent(s):
0ad99b7
Add caching to the GUI
Browse files- gui/app.py +27 -7
- pipelines/core/applicant.py +21 -7
- pipelines/core/recruiter.py +39 -8
- src/feature_engg/bert_embedding_data.py +2 -3
- src/utils/model_loader.py +44 -0
gui/app.py
CHANGED
@@ -8,6 +8,7 @@ import altair as alt
|
|
8 |
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
|
9 |
from src.utils.bulk_loading import bulk_load_raw_resume_files
|
10 |
from src.utils.file_reader import extract_text_from_file
|
|
|
11 |
from pipelines.core.applicant import run_tfidf_pipeline as applicant_tfidf, run_bert_pipeline as applicant_bert
|
12 |
from pipelines.core.recruiter import rank_with_tfidf as recruiter_tfidf, rank_with_bert as recruiter_bert
|
13 |
|
@@ -71,7 +72,7 @@ if app_mode == "Applicant":
|
|
71 |
|
72 |
if resume_file:
|
73 |
st.success(f"✅ Successfully uploaded `{resume_file.name}`")
|
74 |
-
if st.button("Find Top Job Matches", type="primary",
|
75 |
|
76 |
with st.spinner(f"Analyzing resume with {model_choice}..."):
|
77 |
|
@@ -84,9 +85,20 @@ if app_mode == "Applicant":
|
|
84 |
raw_resume_text = extract_text_from_file(tmp_file_path)
|
85 |
|
86 |
if model_choice == "BERT":
|
87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
else:
|
89 |
-
|
|
|
|
|
|
|
|
|
|
|
90 |
|
91 |
if not matches:
|
92 |
st.warning("⚠️ No suitable job matches found.")
|
@@ -139,7 +151,7 @@ if app_mode == "Recruiter":
|
|
139 |
|
140 |
if job_desc_file and resume_files:
|
141 |
st.success(f"✅ Successfully uploaded job description `{job_desc_file.name}` and {len(resume_files)} resumes.")
|
142 |
-
if st.button("Rank Resumes", type="primary",
|
143 |
|
144 |
with st.spinner(f"Ranking {len(resume_files)} resumes with {model_choice}..."):
|
145 |
|
@@ -166,9 +178,17 @@ if app_mode == "Recruiter":
|
|
166 |
|
167 |
# 3. Call the appropriate model's pipeline based on the model choice (default to TF-IDF)
|
168 |
if model_choice == "BERT":
|
169 |
-
|
|
|
|
|
|
|
|
|
170 |
else:
|
171 |
-
|
|
|
|
|
|
|
|
|
172 |
|
173 |
# 4. Display results
|
174 |
if not ranked_resumes:
|
@@ -187,7 +207,7 @@ if app_mode == "Recruiter":
|
|
187 |
min_value=0,
|
188 |
max_value=1,),
|
189 |
},
|
190 |
-
|
191 |
hide_index=True,
|
192 |
)
|
193 |
|
|
|
8 |
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
|
9 |
from src.utils.bulk_loading import bulk_load_raw_resume_files
|
10 |
from src.utils.file_reader import extract_text_from_file
|
11 |
+
from src.utils.model_loader import get_applicant_matrix, get_applicant_vectorizer, get_bert_model, get_faiss_index, get_recruiter_vectorizer
|
12 |
from pipelines.core.applicant import run_tfidf_pipeline as applicant_tfidf, run_bert_pipeline as applicant_bert
|
13 |
from pipelines.core.recruiter import rank_with_tfidf as recruiter_tfidf, rank_with_bert as recruiter_bert
|
14 |
|
|
|
72 |
|
73 |
if resume_file:
|
74 |
st.success(f"✅ Successfully uploaded `{resume_file.name}`")
|
75 |
+
if st.button("Find Top Job Matches", type="primary", width='stretch'):
|
76 |
|
77 |
with st.spinner(f"Analyzing resume with {model_choice}..."):
|
78 |
|
|
|
85 |
raw_resume_text = extract_text_from_file(tmp_file_path)
|
86 |
|
87 |
if model_choice == "BERT":
|
88 |
+
bert_model = get_bert_model()
|
89 |
+
faiss_index = get_faiss_index()
|
90 |
+
matches, message = applicant_bert(raw_resume_text,
|
91 |
+
model=bert_model,
|
92 |
+
job_index=faiss_index,
|
93 |
+
top_k=top_k,)
|
94 |
+
|
95 |
else:
|
96 |
+
applicant_vectorizer = get_applicant_vectorizer()
|
97 |
+
applicant_matrix = get_applicant_matrix()
|
98 |
+
matches, message = applicant_tfidf(raw_resume_text,
|
99 |
+
vectorizer=applicant_vectorizer,
|
100 |
+
job_matrix=applicant_matrix,
|
101 |
+
top_k=top_k)
|
102 |
|
103 |
if not matches:
|
104 |
st.warning("⚠️ No suitable job matches found.")
|
|
|
151 |
|
152 |
if job_desc_file and resume_files:
|
153 |
st.success(f"✅ Successfully uploaded job description `{job_desc_file.name}` and {len(resume_files)} resumes.")
|
154 |
+
if st.button("Rank Resumes", type="primary", width='stretch'):
|
155 |
|
156 |
with st.spinner(f"Ranking {len(resume_files)} resumes with {model_choice}..."):
|
157 |
|
|
|
178 |
|
179 |
# 3. Call the appropriate model's pipeline based on the model choice (default to TF-IDF)
|
180 |
if model_choice == "BERT":
|
181 |
+
bert_model = get_bert_model()
|
182 |
+
ranked_resumes, message = recruiter_bert(raw_job_text,
|
183 |
+
raw_resume_texts,
|
184 |
+
model=bert_model,
|
185 |
+
top_k=top_k)
|
186 |
else:
|
187 |
+
vectorizer = get_recruiter_vectorizer()
|
188 |
+
ranked_resumes, message = recruiter_tfidf(raw_job_text,
|
189 |
+
raw_resume_texts,
|
190 |
+
vectorizer=vectorizer,
|
191 |
+
top_k=top_k)
|
192 |
|
193 |
# 4. Display results
|
194 |
if not ranked_resumes:
|
|
|
207 |
min_value=0,
|
208 |
max_value=1,),
|
209 |
},
|
210 |
+
width='stretch',
|
211 |
hide_index=True,
|
212 |
)
|
213 |
|
pipelines/core/applicant.py
CHANGED
@@ -13,7 +13,9 @@ def load_job_titles(job_csv_path: str):
|
|
13 |
raise ValueError("Job CSV must contain a 'title' column.")
|
14 |
return df
|
15 |
|
16 |
-
def run_tfidf_pipeline(raw_resume: str,
|
|
|
|
|
17 |
local_vectorizer_path=None,
|
18 |
local_matrix_path=None,
|
19 |
repo_id="Om-Shandilya/resume-matcher-tfidf",
|
@@ -25,6 +27,8 @@ def run_tfidf_pipeline(raw_resume: str,
|
|
25 |
|
26 |
Args:
|
27 |
raw_resume (str): Raw text of the resume.
|
|
|
|
|
28 |
local_vectorizer_path (str, optional): Local path to TF-IDF vectorizer.
|
29 |
local_matrix_path (str, optional): Local path to TF-IDF matrix.
|
30 |
repo_id (str): Hugging Face repo ID for vectorizer/matrix.
|
@@ -38,8 +42,11 @@ def run_tfidf_pipeline(raw_resume: str,
|
|
38 |
"""
|
39 |
cleaned_resume = clean_text(raw_resume)
|
40 |
|
41 |
-
vectorizer
|
42 |
-
|
|
|
|
|
|
|
43 |
|
44 |
resume_vector = vectorizer.transform([cleaned_resume])
|
45 |
sim_matrix = compute_similarity_matrix(resume_vector, job_matrix)
|
@@ -68,10 +75,12 @@ def run_tfidf_pipeline(raw_resume: str,
|
|
68 |
print(f"[{job_idx}] {job_df.iloc[job_idx]['title']} → {score:0.6f}")
|
69 |
print("==============================================")
|
70 |
|
71 |
-
return [(job_df.iloc[j]['title'], score) for j, score in matches[0]],message
|
72 |
|
73 |
|
74 |
-
def run_bert_pipeline(raw_resume: str,
|
|
|
|
|
75 |
local_bert_path=None,
|
76 |
local_index_path=None,
|
77 |
repo_id="Om-Shandilya/resume-matcher-bert",
|
@@ -82,6 +91,8 @@ def run_bert_pipeline(raw_resume: str,
|
|
82 |
|
83 |
Args:
|
84 |
raw_resume (str): Raw text of the resume.
|
|
|
|
|
85 |
local_bert_path (str, optional): Local path to BERT model.
|
86 |
local_index_path (str, optional): Local path to FAISS index.
|
87 |
repo_id (str): Hugging Face repo ID for model/index.
|
@@ -92,8 +103,11 @@ def run_bert_pipeline(raw_resume: str,
|
|
92 |
Returns:
|
93 |
List[Tuple[str, float]]: List of (job_title, score) for top_k matches.
|
94 |
"""
|
95 |
-
model
|
96 |
-
|
|
|
|
|
|
|
97 |
|
98 |
cleaned_resume = clean_text_for_bert(raw_resume)
|
99 |
resume_embedding = model.encode([cleaned_resume], normalize_embeddings=True)
|
|
|
13 |
raise ValueError("Job CSV must contain a 'title' column.")
|
14 |
return df
|
15 |
|
16 |
+
def run_tfidf_pipeline(raw_resume: str, *,
|
17 |
+
vectorizer=None,
|
18 |
+
job_matrix=None,
|
19 |
local_vectorizer_path=None,
|
20 |
local_matrix_path=None,
|
21 |
repo_id="Om-Shandilya/resume-matcher-tfidf",
|
|
|
27 |
|
28 |
Args:
|
29 |
raw_resume (str): Raw text of the resume.
|
30 |
+
vectorizer (TfidfVectorizer, optional): Preloaded TF-IDF vectorizer.
|
31 |
+
job_matrix (scipy.sparse matrix, optional): Preloaded TF-IDF job matrix.
|
32 |
local_vectorizer_path (str, optional): Local path to TF-IDF vectorizer.
|
33 |
local_matrix_path (str, optional): Local path to TF-IDF matrix.
|
34 |
repo_id (str): Hugging Face repo ID for vectorizer/matrix.
|
|
|
42 |
"""
|
43 |
cleaned_resume = clean_text(raw_resume)
|
44 |
|
45 |
+
if vectorizer is None:
|
46 |
+
vectorizer = load_tfidf_vectorizer(local_vectorizer_path, repo_id, vectorizer_filename)
|
47 |
+
|
48 |
+
if job_matrix is None:
|
49 |
+
job_matrix = load_tfidf_matrix(local_matrix_path, repo_id, matrix_filename)
|
50 |
|
51 |
resume_vector = vectorizer.transform([cleaned_resume])
|
52 |
sim_matrix = compute_similarity_matrix(resume_vector, job_matrix)
|
|
|
75 |
print(f"[{job_idx}] {job_df.iloc[job_idx]['title']} → {score:0.6f}")
|
76 |
print("==============================================")
|
77 |
|
78 |
+
return [(job_df.iloc[j]['title'], score) for j, score in matches[0]], message
|
79 |
|
80 |
|
81 |
+
def run_bert_pipeline(raw_resume: str, *,
|
82 |
+
model=None,
|
83 |
+
job_index=None,
|
84 |
local_bert_path=None,
|
85 |
local_index_path=None,
|
86 |
repo_id="Om-Shandilya/resume-matcher-bert",
|
|
|
91 |
|
92 |
Args:
|
93 |
raw_resume (str): Raw text of the resume.
|
94 |
+
model (SentenceTransformer, optional): Preloaded BERT model.
|
95 |
+
job_index (faiss.Index, optional): Preloaded FAISS index.
|
96 |
local_bert_path (str, optional): Local path to BERT model.
|
97 |
local_index_path (str, optional): Local path to FAISS index.
|
98 |
repo_id (str): Hugging Face repo ID for model/index.
|
|
|
103 |
Returns:
|
104 |
List[Tuple[str, float]]: List of (job_title, score) for top_k matches.
|
105 |
"""
|
106 |
+
if model is None:
|
107 |
+
model = load_bert_model(local_bert_path=local_bert_path, repo_id=repo_id)
|
108 |
+
|
109 |
+
if job_index is None:
|
110 |
+
job_index = load_faiss_index(local_index_path, repo_id, index_filename)
|
111 |
|
112 |
cleaned_resume = clean_text_for_bert(raw_resume)
|
113 |
resume_embedding = model.encode([cleaned_resume], normalize_embeddings=True)
|
pipelines/core/recruiter.py
CHANGED
@@ -6,17 +6,32 @@ from src.processing.text_cleaning import clean_text, clean_text_for_bert
|
|
6 |
|
7 |
|
8 |
def rank_with_tfidf(raw_job_text, raw_resume_texts, *,
|
|
|
9 |
local_vectorizer_path=None,
|
10 |
repo_id="Om-Shandilya/resume-matcher-tfidf",
|
11 |
filename="recruiter/combined_vectorizer.pkl",
|
12 |
top_k=None,
|
13 |
debug=False):
|
14 |
-
"""Rank resumes using TF-IDF similarity.
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
cleaned_job_text = clean_text(raw_job_text)
|
22 |
job_vector = vectorizer.transform([cleaned_job_text])
|
@@ -56,12 +71,28 @@ def rank_with_tfidf(raw_job_text, raw_resume_texts, *,
|
|
56 |
|
57 |
|
58 |
def rank_with_bert(raw_job_text, raw_resume_texts, *,
|
|
|
59 |
local_bert_path=None,
|
60 |
repo_id="Om-Shandilya/resume-matcher-bert",
|
61 |
top_k=None,
|
62 |
debug=False):
|
63 |
-
"""Rank resumes using BERT embeddings.
|
64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
|
66 |
cleaned_job_text = clean_text_for_bert(raw_job_text)
|
67 |
job_embedding = model.encode([cleaned_job_text], normalize_embeddings=True)
|
|
|
6 |
|
7 |
|
8 |
def rank_with_tfidf(raw_job_text, raw_resume_texts, *,
|
9 |
+
vectorizer=None,
|
10 |
local_vectorizer_path=None,
|
11 |
repo_id="Om-Shandilya/resume-matcher-tfidf",
|
12 |
filename="recruiter/combined_vectorizer.pkl",
|
13 |
top_k=None,
|
14 |
debug=False):
|
15 |
+
"""Rank resumes using TF-IDF similarity.
|
16 |
+
|
17 |
+
Args:
|
18 |
+
raw_job_text (str): Raw text of the job description.
|
19 |
+
raw_resume_texts (dict): Dictionary of resume filenames and their raw texts.
|
20 |
+
vectorizer (TfidfVectorizer, optional): Preloaded TF-IDF vectorizer.
|
21 |
+
local_vectorizer_path (str, optional): Local path to TF-IDF vectorizer.
|
22 |
+
repo_id (str): Hugging Face repo ID for vectorizer.
|
23 |
+
filename (str): Filename of the vectorizer in the repo.
|
24 |
+
top_k (int, optional): Number of top matches to return. If None, return all.
|
25 |
+
debug (bool, optional): Print raw similarity scores for both and cleaned resume.
|
26 |
+
|
27 |
+
Returns:
|
28 |
+
List[Tuple[str, float]]: List of (resume_filename, score) for top_k matches, and a message.
|
29 |
+
"""
|
30 |
+
|
31 |
+
if vectorizer is None:
|
32 |
+
vectorizer = load_tfidf_vectorizer(local_vectorizer_path=local_vectorizer_path,
|
33 |
+
repo_id=repo_id,
|
34 |
+
filename=filename)
|
35 |
|
36 |
cleaned_job_text = clean_text(raw_job_text)
|
37 |
job_vector = vectorizer.transform([cleaned_job_text])
|
|
|
71 |
|
72 |
|
73 |
def rank_with_bert(raw_job_text, raw_resume_texts, *,
|
74 |
+
model=None,
|
75 |
local_bert_path=None,
|
76 |
repo_id="Om-Shandilya/resume-matcher-bert",
|
77 |
top_k=None,
|
78 |
debug=False):
|
79 |
+
"""Rank resumes using BERT embeddings.
|
80 |
+
|
81 |
+
Args:
|
82 |
+
raw_job_text (str): Raw text of the job description.
|
83 |
+
raw_resume_texts (dict): Dictionary of resume filenames and their raw text.
|
84 |
+
model (SentenceTransformer, optional): Preloaded BERT model.
|
85 |
+
local_bert_path (str, optional): Local path to BERT model.
|
86 |
+
repo_id (str): Hugging Face repo ID for model.
|
87 |
+
top_k (int, optional): Maximum number of matches to show. If None, show all.
|
88 |
+
debug (bool, optional): Print raw similarity scores.
|
89 |
+
|
90 |
+
Returns:
|
91 |
+
List[Tuple[str, float]]: List of (resume_filename, score) for top_k matches, and a message.
|
92 |
+
"""
|
93 |
+
|
94 |
+
if model is None:
|
95 |
+
model = load_bert_model(local_bert_path=local_bert_path, repo_id=repo_id)
|
96 |
|
97 |
cleaned_job_text = clean_text_for_bert(raw_job_text)
|
98 |
job_embedding = model.encode([cleaned_job_text], normalize_embeddings=True)
|
src/feature_engg/bert_embedding_data.py
CHANGED
@@ -7,11 +7,10 @@ import torch
|
|
7 |
from faiss import read_index
|
8 |
from typing import Optional
|
9 |
from sentence_transformers import SentenceTransformer, models
|
10 |
-
from transformers import AutoTokenizer, AutoModel
|
11 |
from huggingface_hub import hf_hub_download
|
12 |
|
13 |
|
14 |
-
def
|
15 |
device: str = None):
|
16 |
"""
|
17 |
Loads a BERT-based sentence transformer model for embeddings.
|
@@ -84,7 +83,7 @@ def bert_embed_text(df: pd.DataFrame,
|
|
84 |
df[text_column] = df[text_column].fillna("")
|
85 |
|
86 |
if model is None:
|
87 |
-
model =
|
88 |
|
89 |
embeddings = model.encode(
|
90 |
df[text_column].tolist(),
|
|
|
7 |
from faiss import read_index
|
8 |
from typing import Optional
|
9 |
from sentence_transformers import SentenceTransformer, models
|
|
|
10 |
from huggingface_hub import hf_hub_download
|
11 |
|
12 |
|
13 |
+
def create_bert_model(model_name: str,
|
14 |
device: str = None):
|
15 |
"""
|
16 |
Loads a BERT-based sentence transformer model for embeddings.
|
|
|
83 |
df[text_column] = df[text_column].fillna("")
|
84 |
|
85 |
if model is None:
|
86 |
+
model = create_bert_model()
|
87 |
|
88 |
embeddings = model.encode(
|
89 |
df[text_column].tolist(),
|
src/utils/model_loader.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from src.feature_engg.bert_embedding_data import load_bert_model
|
3 |
+
from src.feature_engg.tfidf_vectorizing_data import load_tfidf_vectorizer, load_tfidf_matrix
|
4 |
+
from src.feature_engg.bert_embedding_data import load_faiss_index
|
5 |
+
|
6 |
+
# Using Streamlit's caching mechanism to load models and artifacts only once
|
7 |
+
@st.cache_resource
|
8 |
+
def get_bert_model():
|
9 |
+
"""Loads and caches the BERT model."""
|
10 |
+
|
11 |
+
return load_bert_model(local_bert_path=None,
|
12 |
+
repo_id="Om-Shandilya/resume-matcher-bert")
|
13 |
+
|
14 |
+
@st.cache_resource
|
15 |
+
def get_faiss_index():
|
16 |
+
"""Loads and caches the FAISS index for the applicant view."""
|
17 |
+
|
18 |
+
return load_faiss_index(local_index_path=None,
|
19 |
+
repo_id="Om-Shandilya/resume-matcher-bert",
|
20 |
+
filename="applicant/jobs.faiss")
|
21 |
+
|
22 |
+
@st.cache_resource
|
23 |
+
def get_applicant_vectorizer():
|
24 |
+
"""Loads and caches the TF-IDF vectorizer for the applicant view."""
|
25 |
+
|
26 |
+
return load_tfidf_vectorizer(local_vectorizer_path=None,
|
27 |
+
repo_id="Om-Shandilya/resume-matcher-tfidf",
|
28 |
+
filename="applicant/job_vectorizer.pkl")
|
29 |
+
@st.cache_resource
|
30 |
+
def get_applicant_matrix():
|
31 |
+
"""Loads and caches the TF-IDF matrix for the applicant view."""
|
32 |
+
|
33 |
+
return load_tfidf_matrix(local_matrix_path=None,
|
34 |
+
repo_id="Om-Shandilya/resume-matcher-tfidf",
|
35 |
+
filename="applicant/job_matrix.npz")
|
36 |
+
|
37 |
+
|
38 |
+
@st.cache_resource
|
39 |
+
def get_recruiter_vectorizer():
|
40 |
+
"""Loads and caches the TF-IDF vectorizer for the recruiter view."""
|
41 |
+
|
42 |
+
return load_tfidf_vectorizer(local_vectorizer_path=None,
|
43 |
+
repo_id="Om-Shandilya/resume-matcher-tfidf",
|
44 |
+
filename="recruiter/combined_vectorizer.pkl")
|