Commit a9988a0
Parent(s): 042558f

Add RESTful API backend and decoupled frontend

Files changed:
- .gitignore +3 -1
- backend/main.py +120 -0
- backend/models.py +37 -0
- environment.yml +14 -23
- gui/app.py +71 -114
- pipelines/core/applicant.py +30 -5
- src/feature_engg/bert_embedding_data.py +21 -8
- src/processing/text_cleaning.py +3 -0
.gitignore
CHANGED
@@ -216,4 +216,6 @@ data/raw/*/*csv
 data/saved_plots/
 models/
 __pycache__/
-tests/
+tests/
+download_nltk_data.py
+nltk_data/
backend/main.py
ADDED
@@ -0,0 +1,120 @@
+from fastapi import FastAPI, HTTPException
+from contextlib import asynccontextmanager
+import sys
+import os
+
+# Add the parent directory to sys.path so the backend can find the 'src' and 'pipelines' modules.
+PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
+if PROJECT_ROOT not in sys.path:
+    sys.path.append(PROJECT_ROOT)
+
+from backend.models import (ResumeRequest, ApplicantResponse, JobMatch,
+                            RecruiterRequest, RecruiterResponse, ResumeMatch)
+from pipelines.core.applicant import run_bert_pipeline, run_tfidf_pipeline, load_job_titles
+from pipelines.core.recruiter import rank_with_bert, rank_with_tfidf
+from src.feature_engg.bert_embedding_data import load_bert_model, load_faiss_index
+from src.feature_engg.tfidf_vectorizing_data import load_tfidf_vectorizer, load_tfidf_matrix
+
+# In-memory storage for models (a dictionary holding all loaded models):
+ml_models = {}
+
+# Create a lifespan function to handle startup and shutdown events:
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """This code runs ONCE when the server starts up."""
+
+    print("🚀 Server starting up: Loading ML models...")
+
+    # Load Applicant Models
+    ml_models["bert_model"] = load_bert_model(local_bert_path=None, repo_id="Om-Shandilya/resume-matcher-bert")
+    ml_models["faiss_index"] = load_faiss_index(local_index_path=None, repo_id="Om-Shandilya/resume-matcher-bert", filename="applicant/jobs.faiss")
+    ml_models["applicant_vectorizer"] = load_tfidf_vectorizer(local_vectorizer_path=None, repo_id="Om-Shandilya/resume-matcher-tfidf", filename="applicant/job_vectorizer.pkl")
+    ml_models["applicant_matrix"] = load_tfidf_matrix(local_matrix_path=None, repo_id="Om-Shandilya/resume-matcher-tfidf", filename="applicant/job_matrix.npz")
+
+    # Load Recruiter Models
+    ml_models["recruiter_vectorizer"] = load_tfidf_vectorizer(local_vectorizer_path=None, repo_id="Om-Shandilya/resume-matcher-tfidf", filename="recruiter/combined_vectorizer.pkl")
+
+    # Load Job Titles DataFrames
+    ml_models["tfidf_job_df"] = load_job_titles(repo_id='Om-Shandilya/resume-matcher-tfidf', filename='applicant/tfidf_job_titles.csv')
+    ml_models["bert_job_df"] = load_job_titles(repo_id='Om-Shandilya/resume-matcher-bert', filename='applicant/bert_job_titles.csv')
+
+    print("✅ ML models loaded successfully.")
+
+    yield
+
+    # This code runs once when the server is shutting down.
+    print("Server shutting down: Clearing ML models...")
+    ml_models.clear()
+
+# Initializing the FastAPI app
+app = FastAPI(
+    title="Resume-Job Matcher API",
+    description="An API for matching resumes to jobs and ranking candidates.",
+    lifespan=lifespan
+)
+
+# Creating the API endpoints:
+@app.get("/")
+def read_root():
+    return {"status": "Resume Matcher API is running."}
+
+# Applicant side endpoints:
+@app.post("/applicant/match/bert", response_model=ApplicantResponse)
+async def match_resume_bert(request: ResumeRequest):
+    try:
+        matches, message = run_bert_pipeline(
+            raw_resume=request.raw_text,
+            model=ml_models["bert_model"],
+            job_index=ml_models["faiss_index"],
+            job_df=ml_models["bert_job_df"],
+            top_k=request.top_k)
+
+        response_matches = [JobMatch(job_title=title, match_score=score) for title, score in matches]
+        return ApplicantResponse(matches=response_matches, message=message)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.post("/applicant/match/tf-idf", response_model=ApplicantResponse)
+async def match_resume_tfidf(request: ResumeRequest):
+    try:
+        matches, message = run_tfidf_pipeline(
+            raw_resume=request.raw_text,
+            vectorizer=ml_models["applicant_vectorizer"],
+            job_matrix=ml_models["applicant_matrix"],
+            job_df=ml_models["tfidf_job_df"],
+            top_k=request.top_k)
+
+        response_matches = [JobMatch(job_title=title, match_score=score) for title, score in matches]
+        return ApplicantResponse(matches=response_matches, message=message)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+# Recruiter side endpoints:
+@app.post("/recruiter/rank/bert", response_model=RecruiterResponse)
+async def rank_resumes_bert(request: RecruiterRequest):
+    try:
+        matches, message = rank_with_bert(
+            raw_job_text=request.raw_job_text,
+            raw_resume_texts=request.raw_resume_texts,
+            model=ml_models["bert_model"],
+            top_k=request.top_k)
+
+        response_matches = [ResumeMatch(resume_filename=fname, match_score=score) for fname, score in matches]
+        return RecruiterResponse(matches=response_matches, message=message)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.post("/recruiter/rank/tf-idf", response_model=RecruiterResponse)
+async def rank_resumes_tfidf(request: RecruiterRequest):
+    try:
+        matches, message = rank_with_tfidf(
+            raw_job_text=request.raw_job_text,
+            raw_resume_texts=request.raw_resume_texts,
+            vectorizer=ml_models["recruiter_vectorizer"],
+            top_k=request.top_k)
+
+        response_matches = [ResumeMatch(resume_filename=fname, match_score=score) for fname, score in matches]
+        return RecruiterResponse(matches=response_matches, message=message)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
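All four endpoints share one shape: POST a JSON body matching the request model, receive matches plus a message. A minimal sketch of calling the BERT applicant endpoint once the server is up, assuming it was started with "uvicorn backend.main:app" on port 8000 (the address gui/app.py uses); the resume text here is a placeholder, not project data:

import requests

payload = {"raw_text": "Data analyst experienced in Python, SQL and Tableau.", "top_k": 3}
resp = requests.post("http://127.0.0.1:8000/applicant/match/bert", json=payload, timeout=180)
resp.raise_for_status()
for match in resp.json()["matches"]:
    print(f"{match['job_title']}: {match['match_score']:.3f}")

The TF-IDF route takes the same payload; only the path segment ("tf-idf" instead of "bert") differs.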
backend/models.py
ADDED
@@ -0,0 +1,37 @@
+from pydantic import BaseModel
+from typing import List, Dict
+
+
+# Applicant Side Models:
+class ResumeRequest(BaseModel):
+    """The request body for matching a single resume."""
+    raw_text: str
+    top_k: int | None = None
+
+class JobMatch(BaseModel):
+    """Represents a single job match with its score."""
+    job_title: str
+    match_score: float
+
+class ApplicantResponse(BaseModel):
+    """The response body containing job matches and a message."""
+    matches: List[JobMatch]
+    message: str
+
+
+# Recruiter Side Models:
+class RecruiterRequest(BaseModel):
+    """The request body for ranking multiple resumes against a job description."""
+    raw_job_text: str
+    raw_resume_texts: Dict[str, str]  # dict of {filename: raw_resume_text}
+    top_k: int | None = None
+
+class ResumeMatch(BaseModel):
+    """Represents a single ranked resume with its score."""
+    resume_filename: str
+    match_score: float
+
+class RecruiterResponse(BaseModel):
+    """The response body containing ranked resumes and a message."""
+    matches: List[ResumeMatch]
+    message: str
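These Pydantic models fix the wire format for both sides of the API. A quick sketch of the JSON they imply; sample values are illustrative, and .model_dump() assumes Pydantic v2 (on v1 the equivalent is .dict()):

from backend.models import ApplicantResponse, JobMatch, ResumeRequest

req = ResumeRequest(raw_text="...resume text...")  # top_k omitted -> None, i.e. "show all"
resp = ApplicantResponse(
    matches=[JobMatch(job_title="Data Scientist", match_score=0.87)],
    message="Matched against loaded job titles.",
)
print(resp.model_dump())
# {'matches': [{'job_title': 'Data Scientist', 'match_score': 0.87}], 'message': '...'}

Note the int | None annotation already requires Python 3.10+, which environment.yml pins.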
environment.yml
CHANGED
@@ -1,38 +1,29 @@
-# Tested on Windows 11 with NVIDIA GPU (CUDA driver 13.0, PyTorch CUDA 12.1 build)
-# Use: conda env create -f environment.yml
-
 name: resume-matcher
 channels:
+  - pytorch
   - conda-forge
   - defaults
 dependencies:
   - python=3.10
   - pip
-  # Core
-  -
+  # --- Core Application Dependencies ---
+  - fastapi
+  - uvicorn
+  - streamlit
+  - altair
   - pandas
   - scikit-learn
-  - scipy
   - joblib
-  -
-  - matplotlib
-  - seaborn
-  - wordcloud
-  # NLP / ML essentials
+  - pytorch
   - faiss-cpu
-  - nltk
-  - statsmodels
   - huggingface_hub
-  # File handling
-  - openpyxl
-  - lxml
-  - pillow
-  - pyyaml
-  - python-docx
   - pdfminer.six
-
+  - python-docx
+  - requests
+  - nltk
+  - scipy
+  - anyio
+  # --- Pip-only packages ---
  - pip:
    - sentence-transformers
-    -
-    - accelerate
-    - datasets
+    - python-multipart
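As the removed header comment documented, usage is unchanged: create the environment with "conda env create -f environment.yml", then "conda activate resume-matcher" (the name: field above). The Windows/CUDA note was dropped along with the plotting and GPU-training extras, trimming the list to what the backend and GUI now import (fastapi, uvicorn, streamlit, requests) plus the existing modelling stack.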
gui/app.py
CHANGED
@@ -1,29 +1,34 @@
 import streamlit as st
 import os
+import sys
 import tempfile
 import pandas as pd
 import shutil
-import sys
 import altair as alt
-
+import requests  # Import for making API requests
+
+# Ensure the parent directory is in sys.path for imports
+PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
+if PROJECT_ROOT not in sys.path:
+    sys.path.append(PROJECT_ROOT)
 from src.utils.bulk_loading import bulk_load_raw_resume_files
 from src.utils.file_reader import extract_text_from_file
-from src.utils.model_loader import get_applicant_matrix, get_applicant_vectorizer, get_bert_model, get_faiss_index, get_recruiter_vectorizer
-from pipelines.core.applicant import run_tfidf_pipeline as applicant_tfidf, run_bert_pipeline as applicant_bert
-from pipelines.core.recruiter import rank_with_tfidf as recruiter_tfidf, rank_with_bert as recruiter_bert
 
-#
+# Configuring the backend API URL
+API_URL = "http://127.0.0.1:8000"
+
+# Configuring the Streamlit app
 st.set_page_config(
     page_title="Resume-Job Matcher",
-    page_icon="
+    page_icon="👨‍💼",
     layout="wide"
 )
 
-#
+# Main app title and description
 st.title("🎯 AI-Powered Resume-Job Matcher")
 st.write("---")
 
-#
+# Creating sidebar for controls
 with st.sidebar:
     st.header("Controls")
     app_mode = st.radio(
@@ -34,36 +39,28 @@ with st.sidebar:
     model_choice = st.selectbox(
         "Choose the AI Model",
         ("TF-IDF", "BERT"),
-        help="TF-IDF is
+        help="TF-IDF is baseline. BERT is more accurate and semantic."
     )
-
     st.write("---")
-
-    # Add a checkbox to control the 'show all' feature
     show_all = st.checkbox("Show all matches", value=False)
-
     if show_all:
         top_k = None
-        # Disable the slider when 'show_all' is checked for better UX
         st.slider(
-            "Number of matches to show",
+            "Number of matches to show",
            min_value=1, max_value=50, value=5, step=1,
            disabled=True
        )
        st.info("Showing all ranked results.")
    else:
-        # Enable the slider when 'show_all' is unchecked
        top_k = st.slider(
-            "Number of matches to show",
+            "Number of matches to show",
            min_value=1, max_value=50, value=5, step=1,
            disabled=False
        )
 
-
-# --- Applicant View ---
+# Applicant view of the app
 if app_mode == "Applicant":
     st.header("Applicant: Match Your Resume to a Job")
-
     resume_file = st.file_uploader(
         "Upload your resume",
         type=['pdf', 'docx', 'txt'],
@@ -73,150 +70,110 @@ if app_mode == "Applicant":
     if resume_file:
         st.success(f"✅ Successfully uploaded `{resume_file.name}`")
         if st.button("Find Top Job Matches", type="primary", width='stretch'):
-
-            with st.spinner(f"Analyzing resume with {model_choice}..."):
-
+            with st.spinner(f"Sending your resume to the AI backend for matching..."):
                 tmp_file_path = None
                 try:
                     with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(resume_file.name)[1]) as tmp_file:
                         tmp_file.write(resume_file.getvalue())
                         tmp_file_path = tmp_file.name
-
                     raw_resume_text = extract_text_from_file(tmp_file_path)
 
-
-
-
-
-
-
-
-
-
-                    applicant_vectorizer = get_applicant_vectorizer()
-                    applicant_matrix = get_applicant_matrix()
-                    matches, message = applicant_tfidf(raw_resume_text,
-                                                       vectorizer=applicant_vectorizer,
-                                                       job_matrix=applicant_matrix,
-                                                       top_k=top_k)
+                    endpoint = f"{API_URL}/applicant/match/{model_choice.lower()}"
+                    payload = {"raw_text": raw_resume_text, "top_k": top_k}
+
+                    response = requests.post(endpoint, json=payload, timeout=180)  # 3-minute timeout
+                    response.raise_for_status()  # Raises HTTPError for bad responses, e.g. 4xx, 5xx
+
+                    api_data = response.json()
+                    matches = api_data.get("matches", [])
+                    message = api_data.get("message", "No message from server.")
 
                     if not matches:
                         st.warning("⚠️ No suitable job matches found.")
                     else:
-                        st.subheader(f"Top {len(matches)} Job Matches:")
                         st.info(message)
-
-
-
-
-                        df = df.sort_values(by="Match Score", ascending=False).reset_index(drop=True)
+                        st.subheader(f"Top {len(matches)} Job Matches:")
+
+                        df = pd.DataFrame(matches)  # Pandas handles a list of dicts directly
+                        df = df.sort_values(by="match_score", ascending=False).reset_index(drop=True)
 
                         chart = alt.Chart(df).mark_bar().encode(
-                            y=alt.Y('
-                            x=alt.X('
-
-
-
-                        ).properties(
-                            # Set a responsive title for the chart to indicate what the bars represent
-                            title="Relative Job Match Scores"
-                        ).interactive()
-
+                            y=alt.Y('job_title', sort='-x', title=None, axis=alt.Axis(labelLimit=400)),
+                            x=alt.X('match_score', axis=None, scale=alt.Scale(domainMin=0)),
+                            tooltip=['job_title', alt.Tooltip('match_score', format='.3f')]
+                        ).properties(title="Relative Job Match Scores").interactive()
+
                         st.altair_chart(chart, use_container_width=True)
 
+                except requests.exceptions.RequestException as e:
+                    st.error(f"API Error: Could not connect to the backend. Please ensure the backend server is running. Details: {e}")
                 except Exception as e:
                     st.error(f"An error occurred: {e}")
-
                 finally:
                     if tmp_file_path and os.path.exists(tmp_file_path):
                         os.unlink(tmp_file_path)
 
-
-# --- Recruiter View ---
+# Recruiter view of the app
 if app_mode == "Recruiter":
     st.header("Recruiter: Rank Resumes for a Job Description")
-
-    job_desc_file = st.file_uploader(
-        "Upload the job description",
-        type=['pdf', 'docx', 'txt'],
-        help="Upload the job description in PDF, DOCX, or TXT format."
-    )
-
-    resume_files = st.file_uploader(
-        "Upload candidate resumes",
-        type=['pdf', 'docx', 'txt'],
-        accept_multiple_files=True,
-        help="Upload one or more resumes."
-    )
+    job_desc_file = st.file_uploader("Upload the job description", type=['pdf', 'docx', 'txt'])
+    resume_files = st.file_uploader("Upload candidate resumes", type=['pdf', 'docx', 'txt'], accept_multiple_files=True)
 
     if job_desc_file and resume_files:
-        st.success(f"✅ Successfully uploaded job description
+        st.success(f"✅ Successfully uploaded job description and {len(resume_files)} resumes.")
         if st.button("Rank Resumes", type="primary", width='stretch'):
-
-            with st.spinner(f"Ranking {len(resume_files)} resumes with {model_choice}..."):
-
-                # Paths for cleanup in the finally block
+            with st.spinner(f"Sending files to the AI backend for ranking..."):
                 temp_dir = None
                 job_desc_path = None
-
                 try:
-                    # 1. Handle the single job description file
                    with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(job_desc_file.name)[1]) as tmp_file:
                        tmp_file.write(job_desc_file.getvalue())
                        job_desc_path = tmp_file.name
                    raw_job_text = extract_text_from_file(job_desc_path)
 
-                    # 2. Handle multiple resume files by creating a temp directory for bulk loading
                    temp_dir = tempfile.mkdtemp()
                    for resume_file in resume_files:
-
-                        with open(resume_path, "wb") as f:
+                        with open(os.path.join(temp_dir, resume_file.name), "wb") as f:
                            f.write(resume_file.getbuffer())
-
-                    # Bulk loading all resumes from the temp directory
                    raw_resume_texts = bulk_load_raw_resume_files(temp_dir)
 
-
-
-
-
-
-
-
-
-
-
-
-
-                        top_k=top_k)
+                    endpoint = f"{API_URL}/recruiter/rank/{model_choice.lower()}"
+                    payload = {
+                        "raw_job_text": raw_job_text,
+                        "raw_resume_texts": raw_resume_texts,
+                        "top_k": top_k
+                    }
+                    response = requests.post(endpoint, json=payload, timeout=300)  # 5-minute timeout
+                    response.raise_for_status()  # Raises HTTPError for bad responses, e.g. 4xx, 5xx
+
+                    api_data = response.json()
+                    ranked_resumes = api_data.get("matches", [])
+                    message = api_data.get("message", "No message from server.")
 
-                    # 4. Display results
                    if not ranked_resumes:
                        st.warning("⚠️ Could not rank resumes. Please check the files.")
                    else:
-                        st.subheader(f"Top {len(ranked_resumes)} Ranked Resumes:")
                        st.info(message)
-
-
-                        df["
+                        st.subheader(f"Top {len(ranked_resumes)} Ranked Resumes:")
+                        df = pd.DataFrame(ranked_resumes)
+                        df["match_score"] = df["match_score"].apply(lambda x: min(1.0, x))
                        st.dataframe(
                            df,
-                            column_config={
-
-
-
-
-
-
-                            hide_index=True,
+                            column_config={
+                                "resume_filename": st.column_config.TextColumn("Resume"),
+                                "match_score": st.column_config.ProgressColumn(
+                                    "Match Score", format="%.2f", min_value=0, max_value=1
+                                ),
+                            },
+                            hide_index=True,
                        )
 
+                except requests.exceptions.RequestException as e:
+                    st.error(f"API Error: Could not connect to the backend. Please ensure the backend server is running. Details: {e}")
                except Exception as e:
-                    st.error(f"
-
+                    st.error(f"An error occurred: {e}")
                finally:
-                    # 5. Clean up all temporary files and the directory
                    if job_desc_path and os.path.exists(job_desc_path):
                        os.unlink(job_desc_path)
                    if temp_dir and os.path.exists(temp_dir):
-                        shutil.rmtree(temp_dir)
+                        shutil.rmtree(temp_dir)
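With the frontend decoupled, gui/app.py no longer loads any models itself; it only needs the FastAPI process alive at API_URL. A hypothetical two-process dev launcher (a sketch, not part of this commit; port 8000 matches API_URL above):

import subprocess
import sys

# Start the FastAPI backend, then the Streamlit frontend, as two separate processes.
backend = subprocess.Popen([sys.executable, "-m", "uvicorn", "backend.main:app", "--port", "8000"])
frontend = subprocess.Popen([sys.executable, "-m", "streamlit", "run", "gui/app.py"])
try:
    frontend.wait()      # run until the UI process exits
finally:
    backend.terminate()  # then take the API down with it

Keeping model loading in the API process is the point of the lifespan handler: Streamlit reruns its script on every interaction, so the heavy artifacts now survive in the long-lived FastAPI process instead of being re-fetched per rerun.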
pipelines/core/applicant.py
CHANGED
@@ -1,5 +1,7 @@
 import pandas as pd
+import os
 from pathlib import Path
+from huggingface_hub import hf_hub_download
 from src.feature_engg.tfidf_vectorizing_data import load_tfidf_vectorizer, load_tfidf_matrix
 from src.feature_engg.bert_embedding_data import load_bert_model, load_faiss_index
 from src.processing.text_cleaning import clean_text, clean_text_for_bert
@@ -7,8 +9,22 @@ from src.matching.matching_engine import compute_similarity_matrix, top_n_tfidf_
 
 # Defining paths for data files
 PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
-def load_job_titles(job_csv_path: str):
-    df = pd.read_csv(job_csv_path)
+def load_job_titles(local_path=None, repo_id=None, filename=None):
+    """
+    Load job titles, preferring a local path if provided, otherwise
+    downloading from the Hugging Face Hub.
+    """
+    file_path = ""
+    if local_path and os.path.exists(local_path):
+        print(f"📂 Using local job titles from {local_path}")
+        file_path = local_path
+    elif repo_id and filename:
+        print(f"🌐 Downloading job titles from Hugging Face Hub ({repo_id}/{filename})")
+        file_path = hf_hub_download(repo_id=repo_id, filename=filename)
+    else:
+        raise ValueError("Must provide either a valid local_path or repo_id and filename.")
+
+    df = pd.read_csv(file_path)
     if "title" not in df.columns:
         raise ValueError("Job CSV must contain a 'title' column.")
     return df
@@ -16,6 +32,7 @@ def load_job_titles(job_csv_path: str):
 def run_tfidf_pipeline(raw_resume: str, *,
                        vectorizer=None,
                        job_matrix=None,
+                       job_df=None,
                        local_vectorizer_path=None,
                        local_matrix_path=None,
                        repo_id="Om-Shandilya/resume-matcher-tfidf",
@@ -28,7 +45,8 @@ def run_tfidf_pipeline(raw_resume: str, *,
     Args:
         raw_resume (str): Raw text of the resume.
         vectorizer (TfidfVectorizer, optional): Preloaded TF-IDF vectorizer.
-        job_matrix (scipy.sparse matrix, optional): Preloaded TF-IDF job matrix
+        job_matrix (scipy.sparse matrix, optional): Preloaded TF-IDF job matrix.
+        job_df (pd.DataFrame, optional): DataFrame of job titles.
         local_vectorizer_path (str, optional): Local path to TF-IDF vectorizer.
         local_matrix_path (str, optional): Local path to TF-IDF matrix.
         repo_id (str): Hugging Face repo ID for vectorizer/matrix.
@@ -51,7 +69,9 @@ def run_tfidf_pipeline(raw_resume: str, *,
     resume_vector = vectorizer.transform([cleaned_resume])
     sim_matrix = compute_similarity_matrix(resume_vector, job_matrix)
 
-    job_df
+    if job_df is None:
+        job_df = load_job_titles(repo_id='Om-Shandilya/resume-matcher-tfidf', filename='applicant/tfidf_job_titles.csv')
+
     total_jobs = len(job_df['title'].unique())
 
     message = ""
@@ -81,6 +101,7 @@ def run_tfidf_pipeline(raw_resume: str, *,
 def run_bert_pipeline(raw_resume: str, *,
                       model=None,
                       job_index=None,
+                      job_df=None,
                      local_bert_path=None,
                      local_index_path=None,
                      repo_id="Om-Shandilya/resume-matcher-bert",
@@ -93,6 +114,7 @@ def run_bert_pipeline(raw_resume: str, *,
         raw_resume (str): Raw text of the resume.
         model (SentenceTransformer, optional): Preloaded BERT model.
         job_index (faiss.Index, optional): Preloaded FAISS index.
+        job_df (pd.DataFrame, optional): DataFrame of job titles.
        local_bert_path (str, optional): Local path to BERT model.
        local_index_path (str, optional): Local path to FAISS index.
        repo_id (str): Hugging Face repo ID for model/index.
@@ -113,7 +135,10 @@ def run_bert_pipeline(raw_resume: str, *,
     resume_embedding = model.encode([cleaned_resume], normalize_embeddings=True)
 
     D, I = job_index.search(resume_embedding, job_index.ntotal)
-
+
+    if job_df is None:
+        job_df = load_job_titles(repo_id='Om-Shandilya/resume-matcher-bert', filename='applicant/bert_job_titles.csv')
+
     total_jobs = len(job_df['title'].unique())
 
     message = ""
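The new keyword-only job_df parameter completes the dependency-injection pattern: the FastAPI lifespan passes preloaded artifacts, while a bare call falls back to fetching them from the Hub via the repo_id/filename defaults. A sketch of the standalone fallback path (resume text is illustrative, and it assumes the unchanged function body resolves the remaining None arguments from those defaults, as the signature suggests):

from pipelines.core.applicant import run_tfidf_pipeline

# No preloaded artifacts: vectorizer, matrix and job_df come from the Hub.
matches, message = run_tfidf_pipeline("Data analyst skilled in Python and SQL.", top_k=5)
print(message)
for title, score in matches:  # (job_title, match_score) pairs, as consumed in backend/main.py
    print(f"{title}: {score:.3f}")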
src/feature_engg/bert_embedding_data.py
CHANGED
@@ -100,17 +100,30 @@ def bert_embed_text(df: pd.DataFrame,
     return embeddings, model
 
 
-
-
+import faiss
+import os
+from huggingface_hub import hf_hub_download
+
+def load_faiss_index(local_index_path: str, repo_id: str, filename: str, lazy_loading: bool = True):
+    """
+    Load FAISS index, preferring local then HF Hub. Applies lazy loading by default.
+    """
+    index_path = ""
     if local_index_path:
         if not os.path.exists(local_index_path):
             raise FileNotFoundError(f"❌ Local FAISS index not found at {local_index_path}")
-        print(f"
-
-
-
-
-
+        print(f"📂 Using local FAISS index from {local_index_path}")
+        index_path = local_index_path
+    else:
+        print(f"🌐 Downloading FAISS index from Hugging Face Hub ({repo_id}/{filename})")
+        index_path = hf_hub_download(repo_id=repo_id, filename=filename)
+
+    if lazy_loading:
+        print(" -> Loading with lazy loading (MMAP).")
+        return faiss.read_index(index_path, faiss.IO_FLAG_MMAP)
+    else:
+        print(" -> Loading into memory directly.")
+        return faiss.read_index(index_path)
 
 def load_bert_model(local_bert_path: str, repo_id: str='Om-Shandilya/resume-matcher-bert'):
     """
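load_faiss_index defaults to memory-mapping (faiss.IO_FLAG_MMAP), which leaves the index on disk and pages vectors in on demand, a sensible default for a memory-constrained Space; lazy_loading=False reads the whole index into RAM instead. A usage sketch (repo and filename are the values backend/main.py passes):

from src.feature_engg.bert_embedding_data import load_faiss_index

# Memory-mapped by default:
index = load_faiss_index(local_index_path=None,
                         repo_id="Om-Shandilya/resume-matcher-bert",
                         filename="applicant/jobs.faiss")

# Or fully in memory for faster repeated searches:
# index = load_faiss_index(None, "Om-Shandilya/resume-matcher-bert",
#                          "applicant/jobs.faiss", lazy_loading=False)

print(index.ntotal)  # number of indexed job vectors, identical either way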
src/processing/text_cleaning.py
CHANGED
@@ -1,4 +1,5 @@
 import re
+import os
 import string
 import pandas as pd
 from typing import Optional
@@ -9,6 +10,8 @@ from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
 import unicodedata
 
+nltk.data.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'nltk_data'))
+
 # Only download necessary NLTK resources if not already present
 nltk_packages = {
     "stopwords": "corpora/stopwords",