Om-Shandilya committed
Commit 0ad99b7 · 1 Parent(s): 92feaec

Add GUI and Refactor Pipelines for reusability

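The refactor moves the matching logic into pipelines/core/, so the same functions now back both the Streamlit GUI (gui/app.py) and the thin CLI wrappers. A minimal sketch of that reuse, assuming the default Hugging Face Hub artifacts are reachable and that resume.txt is a hypothetical local file:

from src.utils.file_reader import extract_text_from_file
from pipelines.core.applicant import run_tfidf_pipeline

# Any front end can call the core pipeline directly; it returns ranked
# (job_title, score) pairs plus a human-readable status message.
raw_resume = extract_text_from_file("resume.txt")
matches, message = run_tfidf_pipeline(raw_resume, top_k=5)
print(message)
for title, score in matches:
    print(f"{title}: {score:.4f}")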
.gitignore CHANGED
@@ -215,4 +215,5 @@ data/processed/*.txt
 data/raw/*/*csv
 data/saved_plots/
 models/
+__pycache__/
 tests/
gui/app.py ADDED
@@ -0,0 +1,202 @@
+import streamlit as st
+import os
+import tempfile
+import pandas as pd
+import shutil
+import sys
+import altair as alt
+sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
+from src.utils.bulk_loading import bulk_load_raw_resume_files
+from src.utils.file_reader import extract_text_from_file
+from pipelines.core.applicant import run_tfidf_pipeline as applicant_tfidf, run_bert_pipeline as applicant_bert
+from pipelines.core.recruiter import rank_with_tfidf as recruiter_tfidf, rank_with_bert as recruiter_bert
+
+# --- App Configuration ---
+st.set_page_config(
+    page_title="Resume-Job Matcher",
+    page_icon="📄",
+    layout="wide"
+)
+
+# --- Main App ---
+st.title("🎯 AI-Powered Resume-Job Matcher")
+st.write("---")
+
+# --- Sidebar for Mode Selection ---
+with st.sidebar:
+    st.header("Controls")
+    app_mode = st.radio(
+        "Choose your view",
+        ("Applicant", "Recruiter"),
+        help="Select 'Applicant' to match your resume to jobs. Select 'Recruiter' to rank resumes for a job."
+    )
+    model_choice = st.selectbox(
+        "Choose the AI Model",
+        ("TF-IDF", "BERT"),
+        help="TF-IDF is faster. BERT is more accurate."
+    )
+
+    st.write("---")
+
+    # Add a checkbox to control the 'show all' feature
+    show_all = st.checkbox("Show all matches", value=False)
+
+    if show_all:
+        top_k = None
+        # Disable the slider when 'show_all' is checked for better UX
+        st.slider(
+            "Number of matches to show",
+            min_value=1, max_value=50, value=5, step=1,
+            disabled=True
+        )
+        st.info("Showing all ranked results.")
+    else:
+        # Enable the slider when 'show_all' is unchecked
+        top_k = st.slider(
+            "Number of matches to show",
+            min_value=1, max_value=50, value=5, step=1,
+            disabled=False
+        )
+
+
+# --- Applicant View ---
+if app_mode == "Applicant":
+    st.header("Applicant: Match Your Resume to a Job")
+
+    resume_file = st.file_uploader(
+        "Upload your resume",
+        type=['pdf', 'docx', 'txt'],
+        help="Please upload your resume in PDF, DOCX, or TXT format."
+    )
+
+    if resume_file:
+        st.success(f"✅ Successfully uploaded `{resume_file.name}`")
+        if st.button("Find Top Job Matches", type="primary", use_container_width=True):
+
+            with st.spinner(f"Analyzing resume with {model_choice}..."):
+
+                tmp_file_path = None
+                try:
+                    with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(resume_file.name)[1]) as tmp_file:
+                        tmp_file.write(resume_file.getvalue())
+                        tmp_file_path = tmp_file.name
+
+                    raw_resume_text = extract_text_from_file(tmp_file_path)
+
+                    if model_choice == "BERT":
+                        matches, message = applicant_bert(raw_resume_text, top_k=top_k)
+                    else:
+                        matches, message = applicant_tfidf(raw_resume_text, top_k=top_k)
+
+                    if not matches:
+                        st.warning("⚠️ No suitable job matches found.")
+                    else:
+                        st.subheader(f"Top {len(matches)} Job Matches:")
+                        st.info(message)
+
+                        df = pd.DataFrame(matches, columns=["Job Title", "Match Score"])
+
+                        # Sort the DataFrame by 'Match Score' in descending order to show best matches at the top
+                        df = df.sort_values(by="Match Score", ascending=False).reset_index(drop=True)
+
+                        chart = alt.Chart(df).mark_bar().encode(
+                            y=alt.Y('Job Title', sort='-x', title=None),
+                            x=alt.X('Match Score', axis=None, scale=alt.Scale(domainMin=0)),
+
+                            # Tooltip to reveal score on hover
+                            tooltip=['Job Title', alt.Tooltip('Match Score', format='.3f')]
+                        ).properties(
+                            # Set a responsive title for the chart to indicate what the bars represent
+                            title="Relative Job Match Scores"
+                        ).interactive()
+
+                        st.altair_chart(chart, use_container_width=True)
+
+                except Exception as e:
+                    st.error(f"An error occurred: {e}")
+
+                finally:
+                    if tmp_file_path and os.path.exists(tmp_file_path):
+                        os.unlink(tmp_file_path)
+
+
+# --- Recruiter View ---
+if app_mode == "Recruiter":
+    st.header("Recruiter: Rank Resumes for a Job Description")
+
+    job_desc_file = st.file_uploader(
+        "Upload the job description",
+        type=['pdf', 'docx', 'txt'],
+        help="Upload the job description in PDF, DOCX, or TXT format."
+    )
+
+    resume_files = st.file_uploader(
+        "Upload candidate resumes",
+        type=['pdf', 'docx', 'txt'],
+        accept_multiple_files=True,
+        help="Upload one or more resumes."
+    )
+
+    if job_desc_file and resume_files:
+        st.success(f"✅ Successfully uploaded job description `{job_desc_file.name}` and {len(resume_files)} resumes.")
+        if st.button("Rank Resumes", type="primary", use_container_width=True):

+            with st.spinner(f"Ranking {len(resume_files)} resumes with {model_choice}..."):
+
+                # Paths for cleanup in the finally block
+                temp_dir = None
+                job_desc_path = None
+
+                try:
+                    # 1. Handle the single job description file
+                    with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(job_desc_file.name)[1]) as tmp_file:
+                        tmp_file.write(job_desc_file.getvalue())
+                        job_desc_path = tmp_file.name
+                    raw_job_text = extract_text_from_file(job_desc_path)
+
+                    # 2. Handle multiple resume files by creating a temp directory for bulk loading
+                    temp_dir = tempfile.mkdtemp()
+                    for resume_file in resume_files:
+                        resume_path = os.path.join(temp_dir, resume_file.name)
+                        with open(resume_path, "wb") as f:
+                            f.write(resume_file.getbuffer())
+
+                    # Bulk loading all resumes from the temp directory
+                    raw_resume_texts = bulk_load_raw_resume_files(temp_dir)
+
+                    # 3. Call the appropriate model's pipeline based on the model choice (default to TF-IDF)
+                    if model_choice == "BERT":
+                        ranked_resumes, message = recruiter_bert(raw_job_text, raw_resume_texts, top_k=top_k)
+                    else:
+                        ranked_resumes, message = recruiter_tfidf(raw_job_text, raw_resume_texts, top_k=top_k)
+
+                    # 4. Display results
+                    if not ranked_resumes:
+                        st.warning("⚠️ Could not rank resumes. Please check the files.")
+                    else:
+                        st.subheader(f"Top {len(ranked_resumes)} Ranked Resumes:")
+                        st.info(message)
+                        df = pd.DataFrame(ranked_resumes, columns=["Resume", "Match Score"])
+
+                        df["Match Score"] = df["Match Score"].apply(lambda x: min(1.0, x))
+                        st.dataframe(
+                            df,
+                            column_config={"Resume": st.column_config.TextColumn("Resume"),
+                                           "Match Score": st.column_config.ProgressColumn("Match Score",
+                                                                                          format="%.2f",
+                                                                                          min_value=0,
+                                                                                          max_value=1)},
+                            use_container_width=True,
+                            hide_index=True,
+                        )
+
+                except Exception as e:
+                    st.error(f"⚠️ An error occurred: {e}")
+
+                finally:
+                    # 5. Clean up all temporary files and the directory
+                    if job_desc_path and os.path.exists(job_desc_path):
+                        os.unlink(job_desc_path)
+                    if temp_dir and os.path.exists(temp_dir):
+                        shutil.rmtree(temp_dir)
pipelines/app_pipeline.py DELETED
@@ -1,179 +0,0 @@
-import argparse
-import os
-import pandas as pd
-from src.feature_engg.tfidf_vectorizing_data import load_tfidf_vectorizer, load_tfidf_matrix
-from src.feature_engg.bert_embedding_data import get_bert_model, load_faiss_index
-from src.processing.text_cleaning import clean_text, clean_text_for_bert
-from src.matching.matching_engine import compute_similarity_matrix, top_n_tfidf_matches, top_n_bert_matches
-from src.utils.file_reader import extract_text_from_file
-
-
-def load_job_titles(job_csv_path: str):
-    df = pd.read_csv(job_csv_path)
-    if "title" not in df.columns:
-        raise ValueError("Job CSV must contain a 'title' column.")
-    return df
-
-
-# ------------------------- TF-IDF PIPELINE -------------------------
-def run_tfidf_pipeline(args, raw_resume: str):
-
-    # Step 1: Clean resume
-    cleaned_resume = clean_text(raw_resume)
-
-    # Step 2: Load vectorizer + job matrix (local first, fallback HF)
-    vectorizer = load_tfidf_vectorizer(
-        local_vectorizer_path=args.local_vectorizer_path,
-        repo_id=args.tfidf_repo_id,
-        filename=args.vectorizer_filename
-    )
-    job_matrix = load_tfidf_matrix(
-        local_matrix_path=args.local_matrix_path,
-        repo_id=args.tfidf_repo_id,
-        filename=args.matrix_filename
-    )
-
-    # Step 3: Vectorize resume
-    resume_vector = vectorizer.transform([cleaned_resume])
-
-    # Step 4: Compute cosine similarity
-    sim_matrix = compute_similarity_matrix(resume_vector, job_matrix)
-
-    # Step 5: Load job titles
-    job_df = load_job_titles("data/app_data/tfidf_job_titles.csv")
-
-    # Step 6: Get top-N job matches
-    top_k = args.top_k
-
-    if args.top_k > len(job_df['title'].unique()):
-        print(f"⚠️ Requested top_k={args.top_k} exceeds unique job titles={len(job_df['title'].unique())}. Reducing top_k.")
-        top_k = len(job_df['title'].unique())
-
-    elif args.top_k is None:
-        top_k = len(job_df['title'].unique())
-        print(f"\nℹ️ Showing all {top_k} job titles.\n")
-
-    matches = top_n_tfidf_matches(sim_matrix, top_n=top_k, job_df=job_df)
-
-    print(f"\n🎯 Top {top_k} Job Matches for the Resume (TF-IDF):")
-    for job_idx, score in matches[0]:
-        print(f"🔹 {job_df.iloc[job_idx]['title']} (score: {score:0.4f})")
-
-    if args.debug:
-        print("\n================ DEBUG MODE ================")
-        print("\n📄--- [DEBUG - TFIDF] Cleaned Resume Preview:\n", cleaned_resume[:1000], "---")
-        print(f"\n--- [DEBUG - TFIDF] Raw Similarity Scores (top {top_k}) ---")
-        for job_idx, score in matches[0]:
-            print(f"[{job_idx}] {job_df.iloc[job_idx]['title']} → {score:0.6f}")
-        print("==============================================")
-
-
-# ------------------------- BERT PIPELINE -------------------------
-def run_bert_pipeline(args, raw_resume: str):
-
-    # Step 1: Load fine-tuned ST model (local or HF Hub)
-    model = get_bert_model(args.local_bert_path or args.bert_repo_id)
-
-    # Step 2: Load FAISS index (local or HF Hub)
-    job_index = load_faiss_index(
-        local_index_path=args.local_index_path,
-        repo_id=args.bert_repo_id,
-        filename=args.index_filename
-    )
-
-    # Step 3: Clean resume text for transformer
-    cleaned_resume = clean_text_for_bert(raw_resume)
-
-    # Step 4: Embed
-    resume_embedding = model.encode(
-        [cleaned_resume],
-        normalize_embeddings=True
-    )
-
-    # Step 5: Search
-    n_jobs = job_index.ntotal
-    D, I = job_index.search(resume_embedding, n_jobs)
-
-    # Step 6: Load job titles
-    job_df = load_job_titles("data/app_data/bert_job_titles.csv")
-
-    # Step 7: Rank top-N
-    top_k = args.top_k
-
-    if args.top_k > len(job_df['title'].unique()):
-        print(f"⚠️ Requested top_k={args.top_k} exceeds unique job titles={len(job_df['title'].unique())}. Reducing top_k.")
-        top_k = len(job_df['title'].unique())
-
-    elif args.top_k is None:
-        top_k = len(job_df['title'].unique())
-        print(f"\nℹ️ Showing all {top_k} job titles.\n")
-
-    matches = top_n_bert_matches(I, D, job_df, top_n=top_k)
-
-    print(f"\n🎯 Top {top_k} Job Matches for the Resume (BERT):")
-    for idx, score in matches:
-        print(f"🔹 {job_df.iloc[idx]['title']} (score: {score:0.4f})")
-
-    if args.debug:
-        print("\n================ DEBUG MODE ================")
-        print(f"\n--- [DEBUG - BERT/FAISS] Raw Similarity Scores (top {top_k}) ---")
-        for idx, score in matches:
-            print(f"[{idx}] {job_df.iloc[idx]['title']} → {score:0.6f}")
-        print("==============================================")
-
-
-# ------------------------- MAIN -------------------------
-def main(args):
-    try:
-        if not os.path.exists(args.resume_path):
-            raise FileNotFoundError(f"⚠️ Resume file not found at: {args.resume_path}")
-
-        raw_resume = extract_text_from_file(args.resume_path)
-        print(f"\n📄 Resume: {args.resume_path}")
-
-        # Pipeline selector
-        print(f"⚙️ Using model: {args.model.upper()}")
-        if args.model == "bert":
-            run_bert_pipeline(args, raw_resume)
-        else:
-            run_tfidf_pipeline(args, raw_resume)
-
-    except Exception as e:
-        print(f"❌ Error: {str(e)}")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Match a resume to top relevant job titles")
-
-    # Shared args
-    parser.add_argument("--resume_path", type=str, required=True, help="Path to resume file")
-    parser.add_argument("--model", type=str, choices=["tfidf", "bert"], default="tfidf")
-    parser.add_argument("--top_k", type=int, default=None,
-                        help="Number of top matches to return if not specified, returns all")
-    parser.add_argument("--debug", action="store_true",
-                        help="print raw similarity scores for both and cleaned resume for tfidf pipeline")
-
-    # TF-IDF args
-    parser.add_argument("--local_vectorizer_path", type=str, default=None,
-                        help="Local TF-IDF vectorizer .pkl file")
-    parser.add_argument("--local_matrix_path", type=str, default=None,
-                        help="Local TF-IDF job matrix .npz file")
-    parser.add_argument("--tfidf_repo_id", type=str, default="Om-Shandilya/resume-matcher-tfidf",
-                        help="Hub repo id for HuggingFace model")
-    parser.add_argument("--vectorizer_filename", type=str, default="applicant/job_vectorizer.pkl",
-                        help="Filename of vectorizer in the HF repo")
-    parser.add_argument("--matrix_filename", type=str, default="applicant/job_matrix.npz",
-                        help="Filename of matrix in the HF repo")
-
-    # BERT args
-    parser.add_argument("--local_bert_path", type=str, default=None,
-                        help="Local fine-tuned ST model path")
-    parser.add_argument("--local_index_path", type=str, default=None,
-                        help="Local FAISS index file path")
-    parser.add_argument("--bert_repo_id", type=str, default="Om-Shandilya/resume-matcher-bert",
-                        help="fine-tuned ST model's HF repo id")
-    parser.add_argument("--index_filename", type=str, default="applicant/jobs.faiss",
-                        help="Filename of FAISS index in the HF repo")
-
-    args = parser.parse_args()
-    main(args)
pipelines/applicant_pipeline.py ADDED
@@ -0,0 +1,59 @@
+import argparse, os
+from src.utils.file_reader import extract_text_from_file
+from pipelines.core.applicant import run_tfidf_pipeline, run_bert_pipeline
+
+def main(args):
+    try:
+        if not os.path.exists(args.resume_path):
+            raise FileNotFoundError(f"⚠️ Resume not found at {args.resume_path}")
+        raw_resume = extract_text_from_file(args.resume_path)
+
+        if args.model == "bert":
+            matches, message = run_bert_pipeline(raw_resume,
+                                                 local_bert_path=args.local_bert_path,
+                                                 local_index_path=args.local_index_path,
+                                                 repo_id=args.bert_repo_id,
+                                                 index_filename=args.index_filename,
+                                                 top_k=args.top_k,
+                                                 debug=args.debug)
+        else:
+            matches, message = run_tfidf_pipeline(raw_resume,
+                                                  local_vectorizer_path=args.local_vectorizer_path,
+                                                  local_matrix_path=args.local_matrix_path,
+                                                  repo_id=args.tfidf_repo_id,
+                                                  vectorizer_filename=args.vectorizer_filename,
+                                                  matrix_filename=args.matrix_filename,
+                                                  top_k=args.top_k,
+                                                  debug=args.debug)
+
+        print(f"\n{message}")
+        print(f"\n🎯 Top {len(matches)} Job Matches ({args.model.upper()}):")
+        for fname, score in matches:
+            print(f"🔹 {fname} (score: {score:.4f})")
+
+    except Exception as e:
+        print(f"❌ Error: {str(e)}")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Match a resume to top relevant job titles")
+    parser.add_argument("--resume_path", type=str, required=True)
+    parser.add_argument("--model", choices=["tfidf","bert"], default="tfidf")
+    parser.add_argument("--top_k", type=int, default=None)
+    parser.add_argument("--debug", action="store_true",
+                        help="print raw similarity scores for both and cleaned resume for tfidf pipeline")
+
+    # tfidf args
+    parser.add_argument("--local_vectorizer_path", type=str, default=None)
+    parser.add_argument("--local_matrix_path", type=str, default=None)
+    parser.add_argument("--tfidf_repo_id", type=str, default="Om-Shandilya/resume-matcher-tfidf")
+    parser.add_argument("--vectorizer_filename", type=str, default="applicant/job_vectorizer.pkl")
+    parser.add_argument("--matrix_filename", type=str, default="applicant/job_matrix.npz")
+
+    # bert args
+    parser.add_argument("--local_bert_path", type=str, default=None)
+    parser.add_argument("--local_index_path", type=str, default=None)
+    parser.add_argument("--bert_repo_id", type=str, default="Om-Shandilya/resume-matcher-bert")
+    parser.add_argument("--index_filename", type=str, default="applicant/jobs.faiss")
+
+    args = parser.parse_args()
+    main(args)
pipelines/core/applicant.py ADDED
@@ -0,0 +1,125 @@
+import pandas as pd
+from pathlib import Path
+from src.feature_engg.tfidf_vectorizing_data import load_tfidf_vectorizer, load_tfidf_matrix
+from src.feature_engg.bert_embedding_data import load_bert_model, load_faiss_index
+from src.processing.text_cleaning import clean_text, clean_text_for_bert
+from src.matching.matching_engine import compute_similarity_matrix, top_n_tfidf_matches, top_n_bert_matches
+
+# Defining paths for data files
+PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
+
+def load_job_titles(job_csv_path: str):
+    df = pd.read_csv(job_csv_path)
+    if "title" not in df.columns:
+        raise ValueError("Job CSV must contain a 'title' column.")
+    return df
+
+def run_tfidf_pipeline(raw_resume: str,
+                       local_vectorizer_path=None,
+                       local_matrix_path=None,
+                       repo_id="Om-Shandilya/resume-matcher-tfidf",
+                       vectorizer_filename="applicant/job_vectorizer.pkl",
+                       matrix_filename="applicant/job_matrix.npz",
+                       top_k=None,
+                       debug=False):
+    """Return the top-N job matches using the TF-IDF pipeline.
+
+    Args:
+        raw_resume (str): Raw text of the resume.
+        local_vectorizer_path (str, optional): Local path to the TF-IDF vectorizer.
+        local_matrix_path (str, optional): Local path to the TF-IDF matrix.
+        repo_id (str): Hugging Face repo ID for the vectorizer/matrix.
+        vectorizer_filename (str): Filename of the vectorizer in the repo.
+        matrix_filename (str): Filename of the matrix in the repo.
+        top_k (int, optional): Number of top matches to return. If None, return all.
+        debug (bool, optional): Print raw similarity scores and a preview of the cleaned resume.
+
+    Returns:
+        Tuple[List[Tuple[str, float]], str]: (job_title, score) pairs for the top matches, plus a status message.
+    """
+    cleaned_resume = clean_text(raw_resume)
+
+    vectorizer = load_tfidf_vectorizer(local_vectorizer_path, repo_id, vectorizer_filename)
+    job_matrix = load_tfidf_matrix(local_matrix_path, repo_id, matrix_filename)
+
+    resume_vector = vectorizer.transform([cleaned_resume])
+    sim_matrix = compute_similarity_matrix(resume_vector, job_matrix)
+
+    job_df = load_job_titles(PROJECT_ROOT / "data/app_data/tfidf_job_titles.csv")
+    total_jobs = len(job_df['title'].unique())
+
+    message = ""
+    if top_k is None:
+        final_top_k = total_jobs
+        message = f"✅ Showing all {total_jobs} job matches, ranked by relevance."
+    elif top_k > total_jobs:
+        final_top_k = total_jobs
+        message = f"ℹ️ You requested {top_k} matches, but only {total_jobs} are available. Showing all {total_jobs} matches."
+    else:
+        final_top_k = top_k
+        message = f"✅ Showing the top {final_top_k} job matches."
+
+    matches = top_n_tfidf_matches(sim_matrix, top_n=final_top_k, job_df=job_df)
+
+    if debug:
+        print("\n================ DEBUG MODE ================")
+        print("\n📄--- [DEBUG - TFIDF] Cleaned Resume Preview:\n", cleaned_resume[:1000], "---")
+        print(f"\n--- [DEBUG - TFIDF] Raw Similarity Scores (top {final_top_k}) ---")
+        for job_idx, score in matches[0]:
+            print(f"[{job_idx}] {job_df.iloc[job_idx]['title']} → {score:0.6f}")
+        print("==============================================")
+
+    return [(job_df.iloc[j]['title'], score) for j, score in matches[0]], message
+
+
+def run_bert_pipeline(raw_resume: str,
+                      local_bert_path=None,
+                      local_index_path=None,
+                      repo_id="Om-Shandilya/resume-matcher-bert",
+                      index_filename="applicant/jobs.faiss",
+                      top_k=None,
+                      debug=False):
+    """Return the top-N job matches using the BERT + FAISS pipeline.
+
+    Args:
+        raw_resume (str): Raw text of the resume.
+        local_bert_path (str, optional): Local path to the BERT model.
+        local_index_path (str, optional): Local path to the FAISS index.
+        repo_id (str): Hugging Face repo ID for the model/index.
+        index_filename (str): Filename of the FAISS index in the repo.
+        top_k (int, optional): Number of top matches to return. If None, return all.
+        debug (bool, optional): Print raw similarity scores.
+
+    Returns:
+        Tuple[List[Tuple[str, float]], str]: (job_title, score) pairs for the top matches, plus a status message.
+    """
+    model = load_bert_model(local_bert_path=local_bert_path, repo_id=repo_id)
+    job_index = load_faiss_index(local_index_path, repo_id, index_filename)
+
+    cleaned_resume = clean_text_for_bert(raw_resume)
+    resume_embedding = model.encode([cleaned_resume], normalize_embeddings=True)
+
+    D, I = job_index.search(resume_embedding, job_index.ntotal)
+    job_df = load_job_titles(PROJECT_ROOT / "data/app_data/bert_job_titles.csv")
+    total_jobs = len(job_df['title'].unique())
+
+    message = ""
+    if top_k is None:
+        final_top_k = total_jobs
+        message = f"✅ Showing all {total_jobs} job matches, ranked by relevance."
+    elif top_k > total_jobs:
+        final_top_k = total_jobs
+        message = f"ℹ️ You requested {top_k} matches, but only {total_jobs} are available. Showing all {total_jobs} matches."
+    else:
+        final_top_k = top_k
+        message = f"✅ Showing the top {final_top_k} job matches."
+
+    matches = top_n_bert_matches(I, D, job_df, top_n=final_top_k)
+
+    if debug:
+        print("\n================ DEBUG MODE ================")
+        print(f"\n--- [DEBUG - BERT/FAISS] Raw Similarity Scores (top {final_top_k}) ---")
+        for idx, score in matches:
+            print(f"[{idx}] {job_df.iloc[idx]['title']} → {score:0.6f}")
+        print("==============================================")
+
+    return [(job_df.iloc[i]['title'], score) for i, score in matches], message
pipelines/core/recruiter.py ADDED
@@ -0,0 +1,100 @@
+import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity
+from src.feature_engg.tfidf_vectorizing_data import load_tfidf_vectorizer
+from src.feature_engg.bert_embedding_data import load_bert_model
+from src.processing.text_cleaning import clean_text, clean_text_for_bert
+
+
+def rank_with_tfidf(raw_job_text, raw_resume_texts, *,
+                    local_vectorizer_path=None,
+                    repo_id="Om-Shandilya/resume-matcher-tfidf",
+                    filename="recruiter/combined_vectorizer.pkl",
+                    top_k=None,
+                    debug=False):
+    """Rank resumes against a job description using TF-IDF similarity."""
+    vectorizer = load_tfidf_vectorizer(
+        local_vectorizer_path=local_vectorizer_path,
+        repo_id=repo_id,
+        filename=filename
+    )
+
+    cleaned_job_text = clean_text(raw_job_text)
+    job_vector = vectorizer.transform([cleaned_job_text])
+
+    cleaned_resumes = {fname: clean_text(txt) for fname, txt in raw_resume_texts.items()}
+    resume_matrix = vectorizer.transform(cleaned_resumes.values())
+
+    sims = cosine_similarity(job_vector, resume_matrix)[0]
+    ranked = sorted(zip(cleaned_resumes.keys(), sims), key=lambda x: x[1], reverse=True)
+
+    available_resumes = len(ranked)
+
+    message = ""
+    if top_k is None:
+        final_top_k = available_resumes
+        message = f"✅ Showing all {available_resumes} resume matches, ranked by relevance."
+    elif top_k > available_resumes:
+        final_top_k = available_resumes
+        message = f"ℹ️ You requested {top_k} matches, but only {available_resumes} are available. Showing all {available_resumes} matches."
+    else:
+        final_top_k = top_k
+        message = f"✅ Showing the top {final_top_k} resume matches."
+
+    if debug:
+        print("\n================ DEBUG MODE ================")
+        print("\n📄--- [DEBUG - TFIDF] Cleaned Job Description Preview:\n", cleaned_job_text[:1000], "---")
+        print("\n--- [DEBUG - TFIDF] First 3 Cleaned Resumes ---")
+        for i, (fname, txt) in enumerate(cleaned_resumes.items()):
+            if i >= 3: break
+            print(f"{fname}: {txt[:300]}...\n")
+        print("\n--- [DEBUG - TFIDF] Raw Similarity Scores ---")
+        for fname, score in ranked[:final_top_k]:
+            print(f"{fname} → {score:0.6f}")
+        print("==============================================")
+
+    return [(fname, score) for fname, score in ranked[:final_top_k]], message
+
+
+def rank_with_bert(raw_job_text, raw_resume_texts, *,
+                   local_bert_path=None,
+                   repo_id="Om-Shandilya/resume-matcher-bert",
+                   top_k=None,
+                   debug=False):
+    """Rank resumes against a job description using BERT embeddings."""
+    model = load_bert_model(local_bert_path=local_bert_path, repo_id=repo_id)
+
+    cleaned_job_text = clean_text_for_bert(raw_job_text)
+    job_embedding = model.encode([cleaned_job_text], normalize_embeddings=True)
+
+    cleaned_resumes = {fname: clean_text_for_bert(txt) for fname, txt in raw_resume_texts.items()}
+    resume_embeddings = model.encode(list(cleaned_resumes.values()), normalize_embeddings=True)
+
+    # Dot product equals cosine similarity here because the embeddings are normalized.
+    sims = np.dot(resume_embeddings, job_embedding.T).flatten()
+    ranked = sorted(zip(cleaned_resumes.keys(), sims), key=lambda x: x[1], reverse=True)
+
+    available_resumes = len(ranked)
+
+    message = ""
+    if top_k is None:
+        final_top_k = available_resumes
+        message = f"✅ Showing all {available_resumes} resume matches, ranked by relevance."
+    elif top_k > available_resumes:
+        final_top_k = available_resumes
+        message = f"ℹ️ You requested {top_k} matches, but only {available_resumes} are available. Showing all {available_resumes} matches."
+    else:
+        final_top_k = top_k
+        message = f"✅ Showing the top {final_top_k} resume matches."
+
+    if debug:
+        print("\n================ DEBUG MODE ================")
+        print("\n📄--- [DEBUG - BERT] Cleaned Job Description Preview:\n", cleaned_job_text[:1000], "---")
+        print("\n--- [DEBUG - BERT] First 3 Cleaned Resumes ---")
+        for i, (fname, txt) in enumerate(cleaned_resumes.items()):
+            if i >= 3: break
+            print(f"{fname}: {txt[:300]}...\n")
+        print("\n--- [DEBUG - BERT] Raw Similarity Scores ---")
+        for fname, score in ranked[:final_top_k]:
+            print(f"{fname} → {score:0.6f}")
+        print("==============================================")
+
+    return [(fname, score) for fname, score in ranked[:final_top_k]], message
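The recruiter helpers take the raw job-description text plus a {filename: raw_text} mapping of resumes, which is how both the GUI and the CLI wrapper feed them. A minimal sketch, assuming the default Hub vectorizer is reachable and using two hypothetical in-memory resumes:

from pipelines.core.recruiter import rank_with_tfidf

# Keys are arbitrary labels (usually filenames); values are raw resume text.
resumes = {
    "alice_resume.pdf": "Senior data scientist with Python, NLP and ML pipeline experience...",
    "bob_resume.docx": "Accountant with payroll, audit and reporting background...",
}
ranked, message = rank_with_tfidf("We are hiring an NLP engineer to build matching systems...", resumes, top_k=1)
print(message)
for fname, score in ranked:
    print(f"{fname} → {score:.4f}")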
pipelines/recruiter_pipeline.py CHANGED
@@ -1,137 +1,51 @@
 import argparse
 import os
-import numpy as np
-from sklearn.metrics.pairwise import cosine_similarity
-from src.feature_engg.tfidf_vectorizing_data import load_tfidf_vectorizer
-from src.feature_engg.bert_embedding_data import get_bert_model
 from src.utils.bulk_loading import bulk_load_raw_resume_files
 from src.utils.file_reader import extract_text_from_file
-from src.processing.text_cleaning import clean_text, clean_text_for_bert
+from pipelines.core import recruiter
 
 
-# ------------------------- TF-IDF PIPELINE -------------------------
-def run_tfidf_pipeline(args, raw_job_text, raw_resume_texts):
-    # Step 1: Load vectorizer (local or HF Hub)
-    vectorizer = load_tfidf_vectorizer(
-        local_vectorizer_path=args.local_vectorizer_path,
-        repo_id=args.tfidf_repo_id,
-        filename=args.vectorizer_filename
-    )
-
-    # Step 2: Clean job description
-    cleaned_job_text = clean_text(raw_job_text)
-    job_vector = vectorizer.transform([cleaned_job_text])
-
-    # Step 3: Clean and vectorize resumes
-    cleaned_resumes = {fname: clean_text(txt) for fname, txt in raw_resume_texts.items()}
-    resume_matrix = vectorizer.transform(cleaned_resumes.values())
-
-    # Step 4: Compute similarity
-    sims = cosine_similarity(job_vector, resume_matrix)[0]
-
-    # Step 5: Rank resumes
-    ranked = sorted(zip(cleaned_resumes.keys(), sims), key=lambda x: x[1], reverse=True)
-
-    # Step 6: Top-K handling
-    top_k = args.top_k
-    available_resumes = len(ranked)
-
-    if args.top_k is None:
-        top_k = available_resumes
-        print(f"\nℹ️ Showing all {available_resumes} resumes.\n")
-    elif args.top_k > available_resumes:
-        top_k = available_resumes
-        print(f"\n⚠️ Requested top_k={args.top_k} exceeds available resumes={available_resumes}. Reducing top_k.\n")
-
-    print(f"\n🎯 Top {top_k} Candidate Matches for the Job (TF-IDF):")
-    for i, (fname, score) in enumerate(ranked[:top_k], 1):
-        print(f"{i}. {fname} → score: {score:.4f}")
-
-    if args.debug:
-        print("\n================ DEBUG MODE ================")
-        print("\n📄--- [DEBUG - TFIDF] Cleaned Job Description Preview:\n", cleaned_job_text[:1000], "---")
-        print("\n--- [DEBUG - TFIDF] First 3 Cleaned Resumes ---")
-        for i, (fname, txt) in enumerate(cleaned_resumes.items()):
-            if i >= 3: break
-            print(f"{fname}: {txt[:300]}...\n")
-        print(f"\n--- [DEBUG - TFIDF] Raw Similarity Scores (top {top_k}) ---")
-        for fname, score in ranked[:top_k]:
-            print(f"{fname} → {score:0.6f}")
-        print("==============================================")
-
-
-# ------------------------- BERT PIPELINE -------------------------
-def run_bert_pipeline(args, raw_job_text, raw_resume_texts):
-    # Step 1: Load fine-tuned ST model (local or HF Hub)
-    model = get_bert_model(args.local_bert_path or args.bert_repo_id)
-
-    # Step 2: Clean job description
-    cleaned_job_text = clean_text_for_bert(raw_job_text)
-    job_embedding = model.encode([cleaned_job_text], normalize_embeddings=True)
-
-    # Step 3: Encode resumes
-    cleaned_resumes = {fname: clean_text_for_bert(txt) for fname, txt in raw_resume_texts.items()}
-    resume_embeddings = model.encode(list(cleaned_resumes.values()), normalize_embeddings=True)
-
-    # Step 4: Compute cosine similarity manually
-    # Using dot product as embeddings are normalized and not FAISS since we have small data here.
-    sims = np.dot(resume_embeddings, job_embedding.T).flatten()
-
-    # Step 5: Rank resumes
-    ranked = sorted(zip(cleaned_resumes.keys(), sims), key=lambda x: x[1], reverse=True)
-
-    # Step 6: Top-K handling
-    top_k = args.top_k
-    available_resumes = len(ranked)
-
-    if args.top_k is None:
-        top_k = available_resumes
-        print(f"\nℹ️ Showing all {available_resumes} resumes.\n")
-    elif args.top_k > available_resumes:
-        top_k = available_resumes
-        print(f"\n⚠️ Requested top_k={args.top_k} exceeds available resumes={available_resumes}. Reducing top_k.\n")
-
-    print(f"\n🎯 Top {top_k} Candidate Matches for the Job (BERT):")
-    for i, (fname, score) in enumerate(ranked[:top_k], 1):
-        print(f"{i}. {fname} → score: {score:.4f}")
-
-    if args.debug:
-        print("\n================ DEBUG MODE ================")
-        print("\n📄--- [DEBUG - BERT] Cleaned Job Description Preview:\n", cleaned_job_text[:1000], "---")
-        print("\n--- [DEBUG - BERT] First 3 Cleaned Resumes ---")
-        for i, (fname, txt) in enumerate(cleaned_resumes.items()):
-            if i >= 3: break
-            print(f"{fname}: {txt[:300]}...\n")
-        print(f"\n--- [DEBUG - BERT] Raw Similarity Scores (top {top_k}) ---")
-        for fname, score in ranked[:top_k]:
-            print(f"{fname} → {score:0.6f}")
-        print("==============================================")
-
-
-# ------------------------- MAIN -------------------------
 def main(args):
     try:
-        # Load job description and resumes
         if not os.path.exists(args.job_desc_path):
             raise FileNotFoundError(f"⚠️ Job description not found: {args.job_desc_path}")
+
         raw_job_text = extract_text_from_file(args.job_desc_path)
 
         if not os.path.exists(args.resume_dir):
             raise FileNotFoundError(f"⚠️ Resume directory not found: {args.resume_dir}")
+
         raw_resume_texts = bulk_load_raw_resume_files(args.resume_dir)
 
         if not raw_resume_texts:
             raise ValueError("⚠️ No valid resumes found in the given directory.")
 
-        print(f"\n📄 Job Description: {args.job_desc_path}")
+        print(f"\n📄 Loaded Job Description: {args.job_desc_path}")
         print(f"📂 Loaded {len(raw_resume_texts)} resumes from {args.resume_dir}")
+        print(f"⚙️ Using model: {args.model.upper()}")
 
-        # Pipeline selector
-        print(f"⚙️ Using model: {args.model.upper()}")
         if args.model == "bert":
-            run_bert_pipeline(args, raw_job_text, raw_resume_texts)
+            matches, message = recruiter.rank_with_bert(raw_job_text,
+                                                        raw_resume_texts,
+                                                        local_bert_path=args.local_bert_path,
+                                                        repo_id=args.bert_repo_id,
+                                                        top_k=args.top_k,
+                                                        debug=args.debug
+                                                        )
         else:
-            run_tfidf_pipeline(args, raw_job_text, raw_resume_texts)
+            matches, message = recruiter.rank_with_tfidf(raw_job_text,
+                                                         raw_resume_texts,
+                                                         local_vectorizer_path=args.local_vectorizer_path,
+                                                         repo_id=args.tfidf_repo_id,
+                                                         filename=args.vectorizer_filename,
+                                                         top_k=args.top_k,
+                                                         debug=args.debug
+                                                         )
+
+        print(f"\n{message}")
+        print(f"\n🎯 Top {len(matches)} Resume Matches ({args.model.upper()}):")
+        for i, (resume, score) in enumerate(matches):
+            print(f"{i+1})-> {resume} (score: {score:.4f})")
 
     except Exception as e:
         print(f"❌ Error: {str(e)}")
@@ -140,28 +54,20 @@ def main(args):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Recruiter Pipeline: Rank resumes for a given job description")
 
-    # Shared args
-    parser.add_argument("--job_desc_path", type=str, required=True, help="Path to job description file")
-    parser.add_argument("--resume_dir", type=str, required=True, help="Directory containing applicant resumes")
+    parser.add_argument("--job_desc_path", type=str, required=True)
+    parser.add_argument("--resume_dir", type=str, required=True)
     parser.add_argument("--model", type=str, choices=["tfidf", "bert"], default="tfidf")
-    parser.add_argument("--top_k", type=int, default=None,
-                        help="Number of top matches to return if not specified, returns all")
-    parser.add_argument("--debug", action="store_true",
-                        help="print raw similarity scores and cleaned texts for debugging")
+    parser.add_argument("--top_k", type=int, default=None)
+    parser.add_argument("--debug", action="store_true")
 
     # TF-IDF args
-    parser.add_argument("--local_vectorizer_path", type=str, default=None,
-                        help="Local TF-IDF vectorizer .pkl file")
-    parser.add_argument("--tfidf_repo_id", type=str, default="Om-Shandilya/resume-matcher-tfidf",
-                        help="Hub repo id for HuggingFace TF-IDF model")
-    parser.add_argument("--vectorizer_filename", type=str, default="recruiter/combined_vectorizer.pkl",
-                        help="Filename of vectorizer in the HF repo")
+    parser.add_argument("--local_vectorizer_path", type=str, default=None)
+    parser.add_argument("--tfidf_repo_id", type=str, default="Om-Shandilya/resume-matcher-tfidf")
+    parser.add_argument("--vectorizer_filename", type=str, default="recruiter/combined_vectorizer.pkl")
 
     # BERT args
-    parser.add_argument("--local_bert_path", type=str, default=None,
-                        help="Local fine-tuned ST model path")
-    parser.add_argument("--bert_repo_id", type=str, default="Om-Shandilya/resume-matcher-bert",
-                        help="fine-tuned ST model's HF repo id")
+    parser.add_argument("--local_bert_path", type=str, default=None)
+    parser.add_argument("--bert_repo_id", type=str, default="Om-Shandilya/resume-matcher-bert")
 
     args = parser.parse_args()
     main(args)
src/feature_engg/bert_embedding_data.py CHANGED
@@ -11,19 +11,20 @@ from transformers import AutoTokenizer, AutoModel
 from huggingface_hub import hf_hub_download
 
 
-def get_bert_model(model_name: str = "all-MiniLM-L6-v2",
+def get_bert_model(model_name: str,
                    device: str = None):
     """
     Loads a BERT-based sentence transformer model for embeddings.
 
     Args:
-        model_name (str): HuggingFace model name. Default is "all-MiniLM-L6-v2".
+        model_name (str): Hugging Face model name or path.
         device (str, optional): "cuda", "cpu", or None (auto-detect).
 
     Returns:
         SentenceTransformer: Loaded model ready for encoding.
     """
     device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+    print(f"📂 Loading BERT model '{model_name}' on device: {device}")
     return SentenceTransformer(model_name, device=device)
 
 
@@ -102,25 +103,44 @@ def bert_embed_text(df: pd.DataFrame,
 
 def load_faiss_index(local_index_path: str, repo_id: str, filename: str):
     """Load FAISS index, preferring local then HF Hub."""
-    if local_index_path and os.path.exists(local_index_path):
+    if local_index_path:
+        if not os.path.exists(local_index_path):
+            raise FileNotFoundError(f"❌ Local FAISS index not found at {local_index_path}")
         print(f"📂 Loading local FAISS index from {local_index_path}")
-        return read_index(local_index_path)
-    else:
-        print(f"🌐 Downloading FAISS index from Hugging Face Hub ({repo_id})")
-        faiss_path = hf_hub_download(repo_id=repo_id, filename=filename)
-        return read_index(faiss_path)
-
-def load_bert_model(local_model_path: str, repo_id: str):
-    """Load BERT model, preferring local then HF Hub."""
-    if local_model_path and os.path.exists(local_model_path):
-        print(f"📂 Loading local BERT model from {local_model_path}")
-        tokenizer = AutoTokenizer.from_pretrained(local_model_path)
-        model = AutoModel.from_pretrained(local_model_path)
-    else:
-        print(f"🌐 Downloading BERT model from Hugging Face Hub ({repo_id})")
-        tokenizer = AutoTokenizer.from_pretrained(repo_id)
-        model = AutoModel.from_pretrained(repo_id)
-    return tokenizer, model
+        return faiss.read_index(local_index_path)
+
+    print(f"🌐 Downloading FAISS index from Hugging Face Hub ({repo_id}/{filename})")
+    faiss_path = hf_hub_download(repo_id=repo_id, filename=filename)
+    return faiss.read_index(faiss_path)
+
+def load_bert_model(local_bert_path: str, repo_id: str = 'Om-Shandilya/resume-matcher-bert'):
+    """
+    Load a SentenceTransformer BERT model:
+    - If local_bert_path is provided, it must be a valid path.
+    - If local_bert_path is None, download from Hugging Face Hub.
+    """
+    if local_bert_path is None:
+        try:
+            print(f"🌐 Downloading BERT model from Hugging Face Hub ({repo_id})")
+            model = SentenceTransformer(repo_id)
+            return model
+        except Exception as e:
+            raise RuntimeError(f"❌ Failed to download model from Hugging Face Hub ({repo_id}). Error: {e}")
+
+    if not os.path.exists(local_bert_path):
+        raise FileNotFoundError(
+            f"❌ The specified local path does not exist: '{local_bert_path}'. "
+            "Please provide a correct path or set it to None to download from the Hub."
+        )
+
+    try:
+        print(f"📂 Loading local BERT model from {local_bert_path}")
+        model = SentenceTransformer(local_bert_path)
+        return model
+    except Exception as e:
+        raise RuntimeError(f"❌ Failed to load local model from '{local_bert_path}'. Error: {e}")
+
 
 def mean_pooling(model_output, attention_mask):
     """Mean pooling for sentence embeddings."""
src/feature_engg/tfidf_vectorizing_data.py CHANGED
@@ -111,21 +111,24 @@ def tfidf_vectorize_text(df: pd.DataFrame,
 
 def load_tfidf_vectorizer(local_vectorizer_path: str, repo_id: str, filename: str):
     """Load TF-IDF vectorizer, preferring local then HF Hub."""
-    if local_vectorizer_path and os.path.exists(local_vectorizer_path):
+    if local_vectorizer_path:
+        if not os.path.exists(local_vectorizer_path):
+            raise FileNotFoundError(f"❌ Local TF-IDF vectorizer not found at {local_vectorizer_path}")
         print(f"📂 Loading local TF-IDF vectorizer from {local_vectorizer_path}")
         return joblib.load(local_vectorizer_path)
-    else:
-        print(f"🌐 Downloading TF-IDF vectorizer from Hugging Face Hub ({repo_id})")
-        vec_path = hf_hub_download(repo_id=repo_id, filename=filename)
-        return joblib.load(vec_path)
 
+    print(f"🌐 Downloading TF-IDF vectorizer from Hugging Face Hub ({repo_id}/{filename})")
+    vec_path = hf_hub_download(repo_id=repo_id, filename=filename)
+    return joblib.load(vec_path)
 
 def load_tfidf_matrix(local_matrix_path: str, repo_id: str, filename: str):
     """Load TF-IDF matrix, preferring local then HF Hub."""
-    if local_matrix_path and os.path.exists(local_matrix_path):
+    if local_matrix_path:
+        if not os.path.exists(local_matrix_path):
+            raise FileNotFoundError(f"❌ Local TF-IDF matrix not found at {local_matrix_path}")
         print(f"📂 Loading local TF-IDF matrix from {local_matrix_path}")
         return load_npz(local_matrix_path)
-    else:
-        print(f"🌐 Downloading TF-IDF matrix from Hugging Face Hub ({repo_id})")
-        mat_path = hf_hub_download(repo_id=repo_id, filename=filename)
-        return load_npz(mat_path)
+
+    print(f"🌐 Downloading TF-IDF matrix from Hugging Face Hub ({repo_id}/{filename})")
+    mat_path = hf_hub_download(repo_id=repo_id, filename=filename)
+    return load_npz(mat_path)
src/utils/file_reader.py CHANGED
@@ -53,10 +53,13 @@ def extract_text_from_file(file_path):
 
     ext = os.path.splitext(file_path)[1].lower()
     if ext == '.pdf':
+        print(f"Extracting text from PDF {file_path}")
         return extract_text_from_pdf(file_path)
     elif ext == '.docx':
+        print(f"Extracting text from DOCX {file_path}")
         return extract_text_from_docx(file_path)
    elif ext == '.txt':
+        print(f"Extracting text from TXT {file_path}")
         return extract_text_from_txt(file_path)
     else:
         raise ValueError(f"Unsupported file type: {ext}")