Commit · 65281cf
1 Parent(s): b4c7536
Add DAPT finetuned BERT functionalities to the app pipeline
Files changed:
- .gitignore +1 -1
- pipelines/app_pipeline.py +95 -33
- pipelines/recruiter_pipeline.py +1 -1
- python +0 -0
- src/feature_engg/bert_embedding_data.py +137 -0
- src/feature_engg/{vectorizing_data.py → tfidf_vectorizing_data.py} +5 -5
- src/fine_tuning/domain_adaptive_bert.py +194 -0
- src/matching/matching_engine.py +31 -3
.gitignore
CHANGED
@@ -211,8 +211,8 @@ Resume.csv
 job_descriptions.csv
 understanding_data.ipynb
 data/processed/*.csv
+data/processed/*.txt
 data/raw/*/*csv
 data/saved_plots/
-test_vectorization.py
 models/
 tests/
pipelines/app_pipeline.py
CHANGED
@@ -1,9 +1,11 @@
 import argparse
 import os
 import pandas as pd
+import faiss
+from sentence_transformers import SentenceTransformer
+from src.feature_engg.tfidf_vectorizing_data import load_vectorizer, load_vector_data
 from src.processing.text_cleaning import clean_text
-from src.matching.matching_engine import compute_similarity_matrix,
+from src.matching.matching_engine import compute_similarity_matrix, top_n_tfidf_matches, top_n_bert_matches
 from src.utils.file_reader import extract_text_from_file


@@ -14,41 +16,86 @@ def load_job_titles(job_csv_path: str):
     return df


+def run_tfidf_pipeline(args, raw_resume: str):
+
+    # Step 2: Clean resume text
+    cleaned_resume = clean_text(raw_resume)
+
+    # Step 3: Load vectorizer and job matrix
+    vectorizer = load_vectorizer(args.vectorizer_path)
+    job_matrix = load_vector_data(args.job_matrix_path)
+
+    # Step 4: Vectorize cleaned resume text
+    resume_vector = vectorizer.transform([cleaned_resume])
+
+    # Step 5: Compute similarity
+    sim_matrix = compute_similarity_matrix(resume_vector, job_matrix)
+
+    # Step 6: Load job titles
+    job_df = load_job_titles(args.job_title_csv)
+
+    # Step 7: Get top-N job matches
+    matches = top_n_tfidf_matches(sim_matrix, top_n=args.top_k, job_df=job_df)
+
+    print(f"\n🎯 Top {args.top_k} Job Matches for the Resume (TF-IDF):")
+    for job_idx, score in matches[0]:
+        print(f"🔹 {job_df.iloc[job_idx]['title']} (score: {score:0.4f})")
+
+    # Optional debug
+    if args.debug:
+        print("\n======= DEBUG MODE =======")
+        print("\n📄 Cleaned Resume Preview:\n", cleaned_resume[:1000])
+        print("\n--- Raw TF-IDF Similarity Scores (Top-K) ---")
+        for job_idx, score in matches[0]:
+            print(f"[{job_idx}] {job_df.iloc[job_idx]['title']} → {score:0.6f}")
+        print("============================")
+
+
+def run_bert_pipeline(args, raw_resume: str):
+    # Step 2: Load SentenceTransformer model
+    model = SentenceTransformer(args.bert_model_path)
+
+    # Step 3: Load FAISS job index
+    job_index = faiss.read_index(args.bert_faiss_index)
+
+    # Step 4: Encode resume into embedding
+    resume_embedding = model.encode([raw_resume], normalize_embeddings=True)
+
+    # Step 5: Search deeply in FAISS index in order to eliminate duplicate job titles
+    # Search across all job embeddings in FAISS
+    n_jobs = job_index.ntotal
+    D, I = job_index.search(resume_embedding, n_jobs)
+
+    # Step 6: Load job titles
+    job_df = load_job_titles(args.job_title_csv)
+
+    print(f"\n🎯 Top {args.top_k} Job Matches for the Resume (BERT):")
+    matches = top_n_bert_matches(I, D, job_df, top_n=args.top_k)
+
+    for idx, score in matches:
+        print(f"🔹 {job_df.iloc[idx]['title']} (score: {score:0.4f})")
+
+    # Optional debug
+    if args.debug:
+        print("\n======= DEBUG MODE =======")
+        print("\n--- Raw BERT/FAISS Similarity Scores (Top-K) ---")
+        for idx, score in matches:
+            print(f"🔹 {job_df.iloc[idx]['title']} (score: {score})")
+        print("============================")
+
+
+def main(args):
+    try:
+        # Step 1: Load raw resume text
+        if not os.path.exists(args.resume_path):
+            raise FileNotFoundError(f"Resume file not found: {args.resume_path}")
+        raw_resume = extract_text_from_file(args.resume_path)
+
+        # Run chosen pipeline
+        if args.model == "bert":
+            run_bert_pipeline(args, raw_resume)
+        else:
+            run_tfidf_pipeline(args, raw_resume)

     except Exception as e:
         print(f"❌ Error: {str(e)}")
@@ -57,11 +104,26 @@ def main(args):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Match a resume to top relevant job titles")
     parser.add_argument('--resume_path', type=str, required=True, help="Path to resume file")
+    parser.add_argument('--model', type=str, choices=['tfidf', 'bert'], default='tfidf',
+                        help="Which model pipeline to use: 'tfidf' or 'bert'")
+
+    # TF-IDF arguments
+    parser.add_argument('--vectorizer_path', type=str, default='models/tfidf/app_tfidf/job_tfidf_vectorizer.pkl')
+    parser.add_argument('--job_matrix_path', type=str, default='models/tfidf/app_tfidf/job_tfidf_matrix.npz')
+
+    # BERT arguments
+    parser.add_argument('--bert_model_path', type=str, default='models/bert/dapt_minilm_sentence_transformer',
+                        help="Path to fine-tuned SentenceTransformer model")
+    parser.add_argument('--bert_faiss_index', type=str, default='models/bert/app_bert/jobs_bert_embeddings.faiss',
+                        help="Path to FAISS index of job embeddings")
+
+    # Shared arguments
     parser.add_argument('--job_title_csv', type=str, default='data/app_data/job_titles.csv')
-    parser.add_argument('--top_k', type=int, default=5,
+    parser.add_argument('--top_k', type=int, default=5,
+                        help="Number of top job matches to return")
+    parser.add_argument('--debug', action='store_true',
+                        help="Print raw similarity scores and cleaned resume for tfidf pipeline")

     args = parser.parse_args()
     main(args)
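For orientation, below is a minimal sketch of the new BERT matching path wired up in this file. The resume string is a placeholder and the paths are simply the argparse defaults shown above; treat it as an illustration, not the app's exact control flow.

import faiss
import pandas as pd
from sentence_transformers import SentenceTransformer
from src.matching.matching_engine import top_n_bert_matches

# Placeholder resume text; the real pipeline reads it via extract_text_from_file().
resume_text = "Data scientist with experience in Python, NLP and ML pipelines."

model = SentenceTransformer("models/bert/dapt_minilm_sentence_transformer")
index = faiss.read_index("models/bert/app_bert/jobs_bert_embeddings.faiss")
job_df = pd.read_csv("data/app_data/job_titles.csv")

# Normalized embeddings + IndexFlatIP make inner product behave like cosine similarity.
embedding = model.encode([resume_text], normalize_embeddings=True)
D, I = index.search(embedding, index.ntotal)  # search all jobs so duplicate titles can be skipped
for idx, score in top_n_bert_matches(I, D, job_df, top_n=5):
    print(job_df.iloc[idx]["title"], f"{score:0.4f}")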
pipelines/recruiter_pipeline.py
CHANGED
@@ -65,7 +65,7 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Recruiter Pipeline: Rank resumes for a given job description")
     parser.add_argument('--job_desc_path', type=str, required=True, help="Path to job description file")
     parser.add_argument('--resume_dir', type=str, required=True, help="Directory containing applicant resumes")
-    parser.add_argument('--vectorizer_path', type=str, default='models/recruiter_tfidf/combined_tfidf_vectorizer.pkl')
+    parser.add_argument('--vectorizer_path', type=str, default='models/tfidf/recruiter_tfidf/combined_tfidf_vectorizer.pkl')
     parser.add_argument('--top_k', type=int, default=10, help="Number of top resumes to return")
     parser.add_argument('--debug', action='store_true', help="Print cleaned job/resume text and raw matches")
python
ADDED
File without changes
src/feature_engg/bert_embedding_data.py
ADDED
@@ -0,0 +1,137 @@
import os
import numpy as np
import pandas as pd
import torch
from typing import Optional
from sentence_transformers import SentenceTransformer, models
import faiss


def get_bert_model(model_name: str = "all-MiniLM-L6-v2",
                   device: str = None):
    """
    Loads a BERT-based sentence transformer model for embeddings.

    Args:
        model_name (str): HuggingFace model name. Default is "all-MiniLM-L6-v2".
        device (str, optional): "cuda", "cpu", or None (auto-detect).

    Returns:
        SentenceTransformer: Loaded model ready for encoding.
    """
    device = device or ("cuda" if torch.cuda.is_available() else "cpu")
    return SentenceTransformer(model_name, device=device)


def save_bert_embeddings(embeddings: np.ndarray,
                         path: str):
    """
    Save dense BERT embeddings as a FAISS index file (.faiss).
    """

    if not path.endswith('.faiss'):
        path += '.faiss'
    os.makedirs(os.path.dirname(path), exist_ok=True)

    embedding_dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(embedding_dimension)  # Inner Product (cosine if normalized)
    index.add(embeddings)

    faiss.write_index(index, path)
    print(f"✅ BERT embeddings saved to FAISS index: [{path}] "
          f"with {index.ntotal} vectors, dim={embedding_dimension}")


def save_bert_model(vectorizer: SentenceTransformer,
                    path: str):
    """Save the full SentenceTransformer model to disk."""

    os.makedirs(path, exist_ok=True)
    vectorizer.save(path)
    print(f"✅ BERT model saved to: [{path}]")


def bert_embed_text(df: pd.DataFrame,
                    text_column: str,
                    label: str,
                    model: Optional[SentenceTransformer] = None,
                    save_path: Optional[str] = None,
                    save_model_file: bool = False):
    """
    Encodes text from a DataFrame into dense BERT embeddings.

    To save the embeddings and model, ensure 'save_path' is provided along with a valid 'label'.

    Args:
        df (pd.DataFrame): DataFrame containing the text to be encoded.
        text_column (str): Column with text to be encoded.
        label (str): Label prefix for saved files (e.g., 'resumes', 'jobs').
        model (SentenceTransformer, optional): Preloaded model.
        save_path (str, optional): Directory to save outputs.
        save_model_file (bool): If True, also saves the model reference.

    Returns:
        tuple: (embeddings ndarray, model)
    """

    if df[text_column].isnull().any():
        print(f"\n⚠️ Found missing values in column '{text_column}', replacing with empty string.")
        df[text_column] = df[text_column].fillna("")

    if model is None:
        model = get_bert_model()

    embeddings = model.encode(
        df[text_column].tolist(),
        convert_to_numpy=True,
        show_progress_bar=True,
        normalize_embeddings=True  # normalizing as it is good for cosine similarity.
    )

    if save_path and label:
        save_bert_embeddings(embeddings, os.path.join(save_path, f"{label}_bert_embeddings.faiss"))
        if save_model_file:
            save_bert_model(model, os.path.join(save_path, f"{label}_bert_model"))

    return embeddings, model


def load_bert_embeddings(path: str):
    """
    Load a FAISS index file (.faiss) from disk.
    """
    if not path.endswith('.faiss'):
        path += '.faiss'
    return faiss.read_index(path)


def load_bert_model(path: str):
    """Load a saved SentenceTransformer model."""

    return SentenceTransformer(path)


def convert_hf_model_to_st(hf_model_path: str,
                           st_model_path: str):
    """
    Converts a HuggingFace model to a SentenceTransformer model.

    Needed as fine-tuning was performed using HuggingFace's Transformers library.

    Args:
        hf_model_path (str): Path to the HuggingFace model.
        st_model_path (str): Path to save the SentenceTransformer model.

    Returns:
        None: Saves the SentenceTransformer model to the specified path.
    """
    # Build SentenceTransformer from HF model
    word_embedding_model = models.Transformer(hf_model_path)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
    st_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # Save to the provided path
    os.makedirs(st_model_path, exist_ok=True)
    st_model.save(st_model_path)
    print(f"✅ Converted HuggingFace model [{hf_model_path}] "
          f"to SentenceTransformer at [{st_model_path}]")
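A short usage sketch of this new module; the toy DataFrame and column names are illustrative, while the output directory matches the app pipeline's default FAISS path:

import pandas as pd
from src.feature_engg.bert_embedding_data import (
    get_bert_model, bert_embed_text, load_bert_embeddings)

# Toy job data for illustration only.
jobs = pd.DataFrame({"title": ["Data Scientist", "Backend Engineer"],
                     "clean_text": ["build ml models in python", "design rest apis in java"]})

model = get_bert_model()  # auto-detects CUDA vs. CPU
embeddings, model = bert_embed_text(
    jobs, text_column="clean_text", label="jobs",
    model=model, save_path="models/bert/app_bert")  # writes jobs_bert_embeddings.faiss

index = load_bert_embeddings("models/bert/app_bert/jobs_bert_embeddings")  # ".faiss" is appended
print(index.ntotal, embeddings.shape)  # 2 vectors of dimension 384 for MiniLM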
src/feature_engg/{vectorizing_data.py → tfidf_vectorizing_data.py}
RENAMED
@@ -29,16 +29,16 @@ def get_combined_tfidf_vectorizer(max_features: int = 40000,
     return TfidfVectorizer(
         stop_words="english",
         lowercase=True,
-        max_features=max_features,
+        max_features=max_features,  # Balanced for resumes + jobs
         ngram_range=ngram_range,
         min_df=5,
         max_df=0.85,
-        sublinear_tf=True,
+        sublinear_tf=True,  # Smooth term frequency scaling
         norm="l2"
     )

 def save_vectorizer(vectorizer: TfidfVectorizer,
-                    path: str = 'models/dev_tfidf/tfidf_vectorizer.pkl'):
+                    path: str = 'models/tfidf/dev_tfidf/tfidf_vectorizer.pkl'):

     """
     Saves a TfidfVectorizer object to a given path. Appends .pkl if missing.
@@ -64,9 +64,9 @@ def save_vector_data(matrix: csr_matrix, path: str):
     print(f"✅ TF-IDF matrix saved to: [{path}]")


-def
+def tfidf_vectorize_text(df: pd.DataFrame,
                          text_column: str,
-                         label: str,
+                         label: str,
                          vectorizer: Optional[TfidfVectorizer] = None,
                          fit_vectorizer: bool = False,
                          save_path: Optional[str] = None,
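To see how the shared vectorizer configuration behaves, here is a self-contained sketch with a toy corpus (not repo data); note that min_df=5 drops any term appearing in fewer than five documents, so tiny corpora need repeated vocabulary:

from sklearn.metrics.pairwise import cosine_similarity
from src.feature_engg.tfidf_vectorizing_data import get_combined_tfidf_vectorizer

# Toy corpus; repeated so every term clears the min_df=5 threshold.
docs = ["python machine learning engineer"] * 5 + ["java backend developer"] * 5

vectorizer = get_combined_tfidf_vectorizer()
X = vectorizer.fit_transform(docs)
print(cosine_similarity(X[:1], X).round(3))  # first document vs. all ten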
src/fine_tuning/domain_adaptive_bert.py
ADDED
@@ -0,0 +1,194 @@
from pathlib import Path
from datasets import Dataset
import argparse
from sklearn.model_selection import train_test_split
from transformers import (AutoTokenizer,
                          AutoModelForMaskedLM,
                          DataCollatorForLanguageModeling,
                          Trainer,
                          TrainingArguments,
                          EarlyStoppingCallback)


def run_dapt(corpus_path: str,
             model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
             output_dir: str = "models/bert/dapt_minilm",
             num_train_epochs: int = 3,
             per_device_train_batch_size: int = 32,
             learning_rate: float = 5e-5,
             warmup_steps: int = 0,
             save_total_limit: int = 2,
             logging_steps: int = 100,
             max_seq_length: int = 256,
             val_split: float = 0.1,
             early_stopping_patience: int = 2,
             early_stopping_threshold: float = 0.01,
             save_best_only: bool = True):
    """
    Runs Domain-Adaptive Pretraining (DAPT) on a given text corpus.

    Args:
        corpus_path (str): Path to the text corpus file.
        model_name (str): Name of the pre-trained BERT model to use. default: "sentence-transformers/all-MiniLM-L6-v2".
        output_dir (str): Directory to save the trained model. default: "models/bert/dapt_minilm".
        num_train_epochs (int): Number of training epochs. default: 3.
        per_device_train_batch_size (int): Batch size for training. default: 32.
        learning_rate (float): Learning rate for training. default: 5e-5.
        warmup_steps (int): Number of warmup steps for training. default: 0.
        save_total_limit (int): Number of checkpoints to save. default: 2.
        logging_steps (int): Number of steps to log. default: 100.
        max_seq_length (int): Maximum sequence length for input. default: 256.
        val_split (float): Fraction of the data to use for validation. default: 0.1.
        early_stopping_patience (int): Number of epochs to wait for improvement before early stopping. default: 2.
        early_stopping_threshold (float): Threshold for early stopping improvement. default: 0.01.
        save_best_only (bool): Whether to save only the best model. default: True.

    Returns:
        output_dir (str): Path to the trained model directory.
    """

    # Load dataset from text file bypassing any future caching errors.
    with open(corpus_path, encoding="utf-8") as f:
        lines = [l.strip() for l in f if l.strip()]

    if val_split > 0:
        # Train/validation split
        train_texts, val_texts = train_test_split(lines, test_size=val_split, random_state=42)
        dataset = Dataset.from_dict({"text": train_texts})
        val_dataset = Dataset.from_dict({"text": val_texts})
    else:
        # Use full data for training
        dataset = Dataset.from_dict({"text": lines})
        val_dataset = None

    # Tokenizer & model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_name)

    # Tokenization function
    def tokenize_fn(batch):
        return tokenizer(
            batch["text"],
            truncation=True,
            padding="max_length",
            max_length=max_seq_length,
        )

    tokenized_train = dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
    tokenized_val = val_dataset.map(tokenize_fn, batched=True, remove_columns=["text"]) if val_dataset else None

    # Data collator with dynamic masking
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,
        mlm_probability=0.15,
    )

    # Base training arguments
    training_args = {
        "output_dir": output_dir,
        "overwrite_output_dir": True,
        "num_train_epochs": num_train_epochs,
        "per_device_train_batch_size": per_device_train_batch_size,
        "save_total_limit": save_total_limit,
        "prediction_loss_only": True,
        "logging_steps": logging_steps,
        "learning_rate": learning_rate,
        "warmup_steps": warmup_steps,
        "save_strategy": "epoch",
        "report_to": "none",
    }

    # Add validation-related args only if val_split > 0
    if val_dataset:
        training_args.update({
            "eval_strategy": "epoch",
            "load_best_model_at_end": save_best_only,
            "metric_for_best_model": "eval_loss",
            "greater_is_better": False,
        })

    training_args = TrainingArguments(**training_args)

    # Callbacks
    callbacks = []
    if val_dataset and early_stopping_patience > 0:
        callbacks.append(
            EarlyStoppingCallback(
                early_stopping_patience=early_stopping_patience,
                early_stopping_threshold=early_stopping_threshold,
            )
        )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val if val_dataset else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        callbacks=callbacks if val_dataset else None,
    )

    # Train
    print("🚀 Starting Domain-Adaptive Pretraining (DAPT)...")
    trainer.train()
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    print(f"✅ DAPT finished! Model saved at: {output_dir}")
    return output_dir


def main():
    parser = argparse.ArgumentParser(description="Domain-Adaptive Pretraining (DAPT) for BERT/SBERT")

    parser.add_argument("--model_name", type=str, default="sentence-transformers/all-MiniLM-L6-v2",
                        help="Pretrained model name or path to load")
    parser.add_argument("--corpus_path", type=str, default="data/processed/domain_corpus.txt",
                        help="Path to plain text corpus for DAPT")
    parser.add_argument("--output_dir", type=str, default="models/dapt_bert",
                        help="Directory to save the fine-tuned model")
    parser.add_argument("--epochs", type=int, default=3,
                        help="Number of training epochs")
    parser.add_argument("--batch_size", type=int, default=32,
                        help="Training batch size per device")
    parser.add_argument("--learning_rate", type=float, default=5e-5,
                        help="Learning rate for AdamW optimizer")
    parser.add_argument("--warmup_steps", type=int, default=0,
                        help="Number of warmup steps for LR scheduler")
    parser.add_argument("--max_seq_length", type=int, default=256,
                        help="Maximum sequence length for inputs")
    parser.add_argument("--val_split", type=float, default=0.1,
                        help="Fraction of data to use for validation (set 0 for no validation)")
    parser.add_argument("--early_stopping_patience", type=int, default=2,
                        help="Number of evals with no improvement before stopping (ignored if val_split=0)")
    parser.add_argument("--early_stopping_threshold", type=float, default=0.01,
                        help="Minimum improvement in eval loss to be considered progress (ignored if val_split=0)")
    parser.add_argument("--save_best_only", action="store_true",
                        help="Save only the best checkpoint (ignored if val_split=0)")

    args = parser.parse_args()
    Path(args.output_dir).mkdir(parents=True, exist_ok=True)

    run_dapt(
        model_name=args.model_name,
        corpus_path=args.corpus_path,
        output_dir=args.output_dir,
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.batch_size,
        learning_rate=args.learning_rate,
        warmup_steps=args.warmup_steps,
        max_seq_length=args.max_seq_length,
        val_split=args.val_split,
        early_stopping_patience=args.early_stopping_patience,
        early_stopping_threshold=args.early_stopping_threshold,
        save_best_only=args.save_best_only,
    )


if __name__ == "__main__":
    main()
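A hedged sketch of how this script is intended to fit together with the converter in src/feature_engg/bert_embedding_data.py; the corpus path and output directories below are simply the defaults used elsewhere in this commit:

from src.fine_tuning.domain_adaptive_bert import run_dapt
from src.feature_engg.bert_embedding_data import convert_hf_model_to_st

# 1) Continue masked-language-model pretraining on the domain text corpus.
hf_checkpoint = run_dapt(
    corpus_path="data/processed/domain_corpus.txt",
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    output_dir="models/bert/dapt_minilm",
    num_train_epochs=3,
    val_split=0.1,
)

# 2) Wrap the adapted HuggingFace checkpoint as a SentenceTransformer for the app pipeline.
convert_hf_model_to_st(hf_checkpoint, "models/bert/dapt_minilm_sentence_transformer")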
src/matching/matching_engine.py
CHANGED
@@ -1,6 +1,6 @@
 import numpy as np
 from sklearn.metrics.pairwise import cosine_similarity
-from src.feature_engg.
+from src.feature_engg.tfidf_vectorizing_data import load_vector_data

 def compute_similarity_matrix(X_resumes, X_jobs ):
     """
@@ -9,7 +9,7 @@ def compute_similarity_matrix(X_resumes, X_jobs ):
     """
     return cosine_similarity(X_resumes, X_jobs)

-def top_n_matches(similarity_matrix: np.ndarray,
+def top_n_tfidf_matches(similarity_matrix: np.ndarray,
                          top_n: int = 5,
                          job_df = None):
     """
@@ -36,6 +36,34 @@ def top_n_matches(similarity_matrix: np.ndarray,
         results[i] = ranked
     return results

+def top_n_bert_matches(indices, distances, job_df, top_n=5):
+    """
+    Deduplicate FAISS results by job title and return top-N unique matches.
+    Searches across all jobs if provided.
+
+    Args:
+        indices (np.ndarray): Indices of nearest neighbors from FAISS (shape: [1, k]).
+        distances (np.ndarray): Distances/similarities from FAISS (shape: [1, k]).
+        job_df (pd.DataFrame): DataFrame containing job titles.
+        top_n (int): Number of unique top matches to return.
+
+    Returns:
+        List[Tuple[int, float]]: List of (job_idx, score) for top-N unique titles.
+    """
+    seen_titles = set()
+    ranked = []
+
+    for idx, score in zip(indices[0], distances[0]):
+        title = job_df.iloc[idx]['title']
+        if title not in seen_titles:
+            ranked.append((idx, float(score)))
+            seen_titles.add(title)
+        if len(ranked) == top_n:
+            break
+
+    return ranked
+
+
 if __name__ == "__main__":
     # Define paths
     resume_vec_path = "models/dev_tfidf/resumes_tfidf_matrix.npz"
@@ -56,7 +84,7 @@ if __name__ == "__main__":
     # print(f"Min score: {np.min(all_scores):0.4f}, \nMax score: {np.max(all_scores):0.4f}, \nMean score: {np.mean(all_scores):0.4f}, \nMedian score: {np.median(all_scores):0.4f}")

     # Get top 5 matches per resume
-    matches =
+    matches = top_n_tfidf_matches(similarity_matrix, top_n=5)

     # Display example output (i.e. top_n job matches for first 5 resumes)
     for resume_idx, top_jobs in list(matches.items())[:5]: