Commit f6d8deb
Parent(s): 65281cf

Pipelines with local model use

Files changed:
- pipelines/app_pipeline.py +7 -7
- pipelines/dev_pipeline.py +5 -5
- pipelines/recruiter_pipeline.py +107 -41
- src/feature_engg/tfidf_vectorizing_data.py +6 -6
- src/utils/bulk_loading.py +11 -11
pipelines/app_pipeline.py CHANGED

@@ -43,12 +43,12 @@ def run_tfidf_pipeline(args, raw_resume: str):

    # Optional debug
    if args.debug:
-        print("\n
-        print("\n
-        print("\n---
+        print("\n================ DEBUG MODE ================")
+        print("\n📄--- [DEBUG - TFIDF] Cleaned Resume Preview:\n", cleaned_resume[:1000], "---")
+        print(f"\n--- [DEBUG - TFIDF] Raw Similarity Scores (top {args.top_k}) ---")
         for job_idx, score in matches[0]:
             print(f"[{job_idx}] {job_df.iloc[job_idx]['title']} → {score:0.6f}")
-        print("
+        print("==============================================")


 def run_bert_pipeline(args, raw_resume: str):

@@ -77,11 +77,11 @@ def run_bert_pipeline(args, raw_resume: str):

    # Optional debug
    if args.debug:
-        print("\n
-        print("\n---
+        print("\n================ DEBUG MODE ================")
+        print(f"\n--- [DEBUG - BERT/FAISS] Raw Similarity Scores (top {args.top_k}) ---")
         for idx, score in matches:
             print(f"🔹 {job_df.iloc[idx]['title']} (score: {score})")
-        print("
+        print("==============================================")


 def main(args):
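The debug block above iterates over matches[0] as (job_idx, score) pairs. Below is a minimal sketch of how such pairs can be produced with scikit-learn's TfidfVectorizer and cosine_similarity; the corpus, variable names, and structure are illustrative assumptions, not the repo's actual run_tfidf_pipeline internals.

# Illustrative sketch only: builds a matches[0] list of (job_idx, score) pairs.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

jobs = ["python developer with sklearn experience", "java backend engineer"]
resume = "data scientist, python, scikit-learn, tf-idf pipelines"

vectorizer = TfidfVectorizer()
job_matrix = vectorizer.fit_transform(jobs)        # one row per job posting
resume_vector = vectorizer.transform([resume])     # single cleaned resume

sims = cosine_similarity(resume_vector, job_matrix)[0]
top_k = 2
# matches[0]: (job index, similarity score) pairs, best match first
matches = [sorted(enumerate(sims), key=lambda x: x[1], reverse=True)[:top_k]]

for job_idx, score in matches[0]:
    print(f"[{job_idx}] score {score:0.6f}")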
pipelines/dev_pipeline.py CHANGED

@@ -37,9 +37,9 @@ if __name__ == "__main__":


    # Step 4: Vectorize using shared TF-IDF vectorizer
-    from src.feature_engg.
-
-    )
+    from src.feature_engg.tfidf_vectorizing_data import (get_tfidf_vectorizer,
+                                                         tfidf_vectorize_text,
+                                                         save_vectorizer)

     print("\n💻 Vectorizing text using shared TF-IDF vectorizer...")


@@ -54,13 +54,13 @@ vector_save_dir = "models/dev_tfidf"
     os.makedirs(vector_save_dir, exist_ok=True)

    # Transform resumes and jobs separately using the same vectorizer
-    X_resumes, _ =
+    X_resumes, _ = tfidf_vectorize_text(
         df_resumes, text_column="text_cleaned", label="resumes",
         vectorizer=shared_vectorizer, fit_vectorizer=False,
         save_path=vector_save_dir, save_vectorizer_file=False  # We'll save manually below
     )

-    X_jobs, _ =
+    X_jobs, _ = tfidf_vectorize_text(
         df_jobs, text_column="text_cleaned", label="jobs",
         vectorizer=shared_vectorizer, fit_vectorizer=False,
         save_path=vector_save_dir, save_vectorizer_file=False
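The point of this change is that both DataFrames are transformed with one already-fitted vectorizer (fit_vectorizer=False), so resumes and jobs land in the same vocabulary and IDF space and their cosine scores are comparable. A minimal sketch of that idea using scikit-learn directly, with a made-up corpus and without the repo's tfidf_vectorize_text wrapper:

# Sketch: fit once on the combined corpus, then transform each side separately.
from sklearn.feature_extraction.text import TfidfVectorizer

resume_texts = ["python pandas sklearn", "java spring microservices"]
job_texts = ["looking for a python data engineer", "senior java developer"]

shared_vectorizer = TfidfVectorizer()
shared_vectorizer.fit(resume_texts + job_texts)         # fit once

X_resumes = shared_vectorizer.transform(resume_texts)   # transform only
X_jobs = shared_vectorizer.transform(job_texts)         # same vocabulary and IDF weights

assert X_resumes.shape[1] == X_jobs.shape[1]            # identical feature space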
pipelines/recruiter_pipeline.py CHANGED

@@ -2,72 +2,138 @@ import argparse
 import os
 import joblib
 from sklearn.metrics.pairwise import cosine_similarity
-
+import faiss
+from sentence_transformers import SentenceTransformer
+from src.utils.bulk_loading import bulk_load_raw_resume_files
 from src.utils.file_reader import extract_text_from_file
 from src.processing.text_cleaning import clean_text


-def
-        raw_job_text = extract_text_from_file(args.job_desc_path)
-        cleaned_job_text = clean_text(raw_job_text)
-        job_vector = vectorizer.transform([cleaned_job_text])
-        top_k = min(args.top_k, len(ranked))
-            print(f"{i}. {fname} → score: {score:.4f}")
-        if args.debug:
-            print("\n===== DEBUG MODE =====")
-            print("\n📄 Cleaned Job Description Preview:\n", cleaned_job_text[:1000])
-            print("\n📄 Raw Similarity Scores:\n", sims)
-            print("=======================")
+def rank_with_tfidf(args, raw_job_text, raw_resume_texts):
+    """TF-IDF recruiter pipeline"""
+    # Step 1: Load vectorizer
+    if not os.path.exists(args.vectorizer_path):
+        raise FileNotFoundError(f"⚠️ Vectorizer file not found: {args.vectorizer_path}")
+    vectorizer = joblib.load(args.vectorizer_path)
+
+    # Step 2: Process job description
+    cleaned_job_text = clean_text(raw_job_text)
+    job_vector = vectorizer.transform([cleaned_job_text])
+
+    # Step 3: Process resumes
+    cleaned_resumes = {fname: clean_text(txt) for fname, txt in raw_resume_texts.items()}
+    resume_matrix = vectorizer.transform(cleaned_resumes.values())
+
+    # Step 4: Compute similarity
+    sims = cosine_similarity(job_vector, resume_matrix)[0]
+
+    if args.debug:
+        print("\n================ DEBUG MODE ================")
+        print("\n[DEBUG - TFIDF] Cleaned job description:")
+        print(cleaned_job_text[:500], "...\n")
+        print("[DEBUG - TFIDF] First 3 cleaned resumes:")
+        for i, (fname, txt) in enumerate(cleaned_resumes.items()):
+            if i >= 3: break
+            print(f"{fname}: {txt[:300]}...\n")
+        print("[DEBUG - TFIDF] Raw similarity scores:", sims[:10])
+        print("==============================================")
+
+    # Step 5: Rank resumes
+    ranked = sorted(zip(cleaned_resumes.keys(), sims), key=lambda x: x[1], reverse=True)
+    return ranked
+
+
+def rank_with_bert(args, raw_job_text, raw_resume_texts):
+    """BERT recruiter pipeline using FAISS (on the fly)"""
+    if not os.path.exists(args.bert_model_path):
+        raise FileNotFoundError(f"⚠️ BERT model not found: {args.bert_model_path}")
+
+    # Step 1: Load BERT model
+    model = SentenceTransformer(args.bert_model_path)
+
+    # Step 2: Encode job description
+    job_embedding = model.encode([raw_job_text], convert_to_numpy=True, normalize_embeddings=True)
+
+    # Step 3: Encode resumes
+    resume_embeddings = model.encode(list(raw_resume_texts.values()), convert_to_numpy=True, normalize_embeddings=True)
+
+    # Step 4: Create FAISS indices
+    local_index = faiss.IndexFlatIP(resume_embeddings.shape[1])
+    local_index.add(resume_embeddings)
+
+    scores, indices = local_index.search(job_embedding, len(raw_resume_texts))
+
+    if args.debug:
+        print("\n================ DEBUG MODE ================")
+        print("\n[DEBUG - BERT/FAISS] Raw job description:")
+        print(raw_job_text[:500], "...\n")
+        print("[DEBUG - BERT/FAISS] First 3 raw resumes:")
+        for i, (fname, txt) in enumerate(raw_resume_texts.items()):
+            if i >= 3: break
+            print(f"{fname}: {txt[:300]}...\n")
+        print(f"[DEBUG - BERT/FAISS] all similarity scores:", scores[0][:len(raw_resume_texts)])
+        print("==============================================")
+
+    # Step 5: Rank resumes
+    ranked = [(list(raw_resume_texts.keys())[i], float(scores[0][j]))
+              for j, i in enumerate(indices[0])]
+    return ranked
+
+
+def main(args):
+    try:
+        # Load raw job and resumes
+        raw_job_text = extract_text_from_file(args.job_desc_path)
+        raw_resume_texts = bulk_load_raw_resume_files(args.resume_dir)
+
+        if not raw_resume_texts:
+            raise ValueError("⚠️ No valid resumes found in the given directory.")
+
+        # Limit the number of resumes displayed based on the top_k argument and available resumes
+        available_resumes = len(raw_resume_texts)
+        top_k = min(args.top_k, available_resumes)
+
+        if args.top_k > available_resumes:
+            print(f"\n⚠️ Only {available_resumes} resumes are available. "
+                  f"Showing top {available_resumes} matches instead of {args.top_k}.\n")
+
+        # Choose model
+        if args.model == "tfidf":
+            ranked = rank_with_tfidf(args, raw_job_text, raw_resume_texts)
+        elif args.model == "bert":
+            ranked = rank_with_bert(args, raw_job_text, raw_resume_texts)
+        else:
+            raise ValueError("❌ Invalid model. Choose 'tfidf' or 'bert'.")
+
+        # Display ranked resumes
+        print(f"\n🎯 Top {top_k} Candidate Matches for the Job ({args.model.upper()}):")
+        for i, (fname, score) in enumerate(ranked[:top_k], 1):
+            print(f"{i}. {fname} → score: {score:.4f}")
+
     except Exception as e:
         print(f"❌ Error: {str(e)}")


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Recruiter Pipeline: Rank resumes for a given job description")
+    parser = argparse.ArgumentParser(description="Recruiter Pipeline: Rank user uploaded resumes for a given job description")
+
+    # Shared arguments
     parser.add_argument('--job_desc_path', type=str, required=True, help="Path to job description file")
     parser.add_argument('--resume_dir', type=str, required=True, help="Directory containing applicant resumes")
-    parser.add_argument('--
+    parser.add_argument('--model', type=str, choices=['tfidf', 'bert'], default='tfidf',
+                        help="Model to use: tfidf or bert")
     parser.add_argument('--top_k', type=int, default=10, help="Number of top resumes to return")
-    parser.add_argument('--debug', action='store_true', help="Print cleaned
+    parser.add_argument('--debug', action='store_true', help="Print cleaned/raw texts and raw similarity scores")
+
+    # TF-IDF specific
+    parser.add_argument('--vectorizer_path', type=str,
+                        default='models/tfidf/recruiter_tfidf/combined_tfidf_vectorizer.pkl',
+                        help="Path to pre-trained TF-IDF vectorizer")
+
+    # BERT specific
+    parser.add_argument('--bert_model_path', type=str,
+                        default='models/bert/dapt_minilm_sentence_transformer',
+                        help="Path to fine-tuned BERT/SBERT model")

     args = parser.parse_args()
     main(args)
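Example invocations of the reworked script (paths are placeholders; the flags come from the argparse definitions above):

python pipelines/recruiter_pipeline.py --job_desc_path data/job.pdf --resume_dir data/resumes/ --model tfidf --top_k 5 --debug
python pipelines/recruiter_pipeline.py --job_desc_path data/job.pdf --resume_dir data/resumes/ --model bert

Because the job and resume embeddings are encoded with normalize_embeddings=True, the inner-product scores returned by faiss.IndexFlatIP are cosine similarities.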
src/feature_engg/tfidf_vectorizing_data.py CHANGED

@@ -27,14 +27,14 @@ def get_combined_tfidf_vectorizer(max_features: int = 40000,
     Creates a TF-IDF vectorizer with specified parameters for larger vocab with both Jobs and Resume.
     """
     return TfidfVectorizer(
-        stop_words="english",
-        lowercase=True,
+        stop_words="english",        # Remove common English stopwords
+        lowercase=True,              # Convert all to lowercase
         max_features=max_features,   # Balanced for resumes + jobs
-        ngram_range=ngram_range,
-        min_df=5,
-        max_df=0.85,
+        ngram_range=ngram_range,     # By default Unigrams + Bigrams
+        min_df=5,                    # Ignore very rare words
+        max_df=0.85,                 # Ignore very common words
         sublinear_tf=True,           # Smooth term frequency scaling
-        norm="l2"
+        norm="l2"                    # Normalize for cosine similarity
     )

 def save_vectorizer(vectorizer: TfidfVectorizer,
|
src/utils/bulk_loading.py
CHANGED
@@ -1,21 +1,20 @@
|
|
1 |
import os
|
2 |
-
from typing import List,
|
3 |
-
|
4 |
-
from src.processing.text_cleaning import clean_text # assuming you already have this
|
5 |
from src.utils.file_reader import extract_text_from_pdf, extract_text_from_docx, extract_text_from_txt
|
6 |
|
7 |
-
def
|
8 |
"""
|
9 |
-
Load multiple resumes from a directory
|
10 |
|
11 |
Args:
|
12 |
input_path : str or List[str]
|
13 |
Either:
|
14 |
- A path to a directory containing resume files, OR
|
15 |
-
- A list of individual file paths
|
|
|
16 |
|
17 |
Returns:
|
18 |
-
Dict[str, str]: Dictionary mapping file's basenames ->
|
19 |
"""
|
20 |
resumes = {}
|
21 |
|
@@ -26,9 +25,11 @@ def bulk_load_cleaned_resume_files(input_path: Union[str, List[str]]) -> Dict[st
|
|
26 |
for f in os.listdir(input_path)
|
27 |
if f.lower().endswith((".pdf", ".docx", ".txt"))
|
28 |
]
|
|
|
29 |
# Case 2: list of files
|
30 |
elif isinstance(input_path, list):
|
31 |
file_paths = input_path
|
|
|
32 |
# Case 3: single file
|
33 |
elif isinstance(input_path, str) and os.path.isfile(input_path):
|
34 |
file_paths = [input_path]
|
@@ -49,13 +50,12 @@ def bulk_load_cleaned_resume_files(input_path: Union[str, List[str]]) -> Dict[st
|
|
49 |
else:
|
50 |
print(f"β οΈ Skipping unsupported file type: {path}")
|
51 |
continue
|
52 |
-
|
53 |
-
|
54 |
-
resumes[os.path.basename(path)] =
|
55 |
|
56 |
except Exception as e:
|
57 |
print(f"β Error processing {path}: {e}")
|
58 |
|
59 |
print(f"β
Loaded {len(resumes)} resumes.")
|
60 |
-
|
61 |
return resumes
|
|
|
1 |
import os
|
2 |
+
from typing import List, Union
|
|
|
|
|
3 |
from src.utils.file_reader import extract_text_from_pdf, extract_text_from_docx, extract_text_from_txt
|
4 |
|
5 |
+
def bulk_load_raw_resume_files(input_path: Union[str, List[str]]):
|
6 |
"""
|
7 |
+
Load multiple resumes from a directory, a list of files, or a single file.
|
8 |
|
9 |
Args:
|
10 |
input_path : str or List[str]
|
11 |
Either:
|
12 |
- A path to a directory containing resume files, OR
|
13 |
+
- A list of individual file paths, OR
|
14 |
+
- A single file path.
|
15 |
|
16 |
Returns:
|
17 |
+
Dict[str, str]: Dictionary mapping file's basenames -> raw text.
|
18 |
"""
|
19 |
resumes = {}
|
20 |
|
|
|
25 |
for f in os.listdir(input_path)
|
26 |
if f.lower().endswith((".pdf", ".docx", ".txt"))
|
27 |
]
|
28 |
+
|
29 |
# Case 2: list of files
|
30 |
elif isinstance(input_path, list):
|
31 |
file_paths = input_path
|
32 |
+
|
33 |
# Case 3: single file
|
34 |
elif isinstance(input_path, str) and os.path.isfile(input_path):
|
35 |
file_paths = [input_path]
|
|
|
50 |
else:
|
51 |
print(f"β οΈ Skipping unsupported file type: {path}")
|
52 |
continue
|
53 |
+
|
54 |
+
# β
Add to dictionary
|
55 |
+
resumes[os.path.basename(path)] = text
|
56 |
|
57 |
except Exception as e:
|
58 |
print(f"β Error processing {path}: {e}")
|
59 |
|
60 |
print(f"β
Loaded {len(resumes)} resumes.")
|
|
|
61 |
return resumes
|
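A short usage sketch for the renamed loader (the directory and file names are placeholders):

# bulk_load_raw_resume_files accepts a directory, a list of file paths, or a single file,
# and returns {basename: raw_text}; text cleaning is no longer done in this helper.
from src.utils.bulk_loading import bulk_load_raw_resume_files

resumes = bulk_load_raw_resume_files("data/resumes/")                # whole directory
# resumes = bulk_load_raw_resume_files(["cv1.pdf", "cv2.docx"])      # or explicit files
for fname, raw_text in resumes.items():
    print(f"{fname}: {len(raw_text)} characters")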