Om-Shandilya committed
Commit 65281cf · 1 Parent(s): b4c7536

Add DAPT finetuned BERT functionalities to the app pipeline

.gitignore CHANGED
@@ -211,8 +211,8 @@ Resume.csv
  job_descriptions.csv
  understanding_data.ipynb
  data/processed/*.csv
+ data/processed/*.txt
  data/raw/*/*csv
  data/saved_plots/
- test_vectorization.py
  models/
  tests/
pipelines/app_pipeline.py CHANGED
@@ -1,9 +1,11 @@
  import argparse
  import os
  import pandas as pd
- from src.feature_engg.vectorizing_data import load_vectorizer, load_vector_data, vectorize_text
+ import faiss
+ from sentence_transformers import SentenceTransformer
+ from src.feature_engg.tfidf_vectorizing_data import load_vectorizer, load_vector_data
  from src.processing.text_cleaning import clean_text
- from src.matching.matching_engine import compute_similarity_matrix, top_n_matches
+ from src.matching.matching_engine import compute_similarity_matrix, top_n_tfidf_matches, top_n_bert_matches
  from src.utils.file_reader import extract_text_from_file


@@ -14,41 +16,86 @@ def load_job_titles(job_csv_path: str):
      return df


- def main(args):
-     try:
-         # Step 1: Load and clean resume text (supports .pdf, .docx, .txt)
-         if not os.path.exists(args.resume_path):
-             raise FileNotFoundError(f"Resume file not found: {args.resume_path}")
-         raw_resume = extract_text_from_file(args.resume_path)
-         cleaned_resume = clean_text(raw_resume)
+ def run_tfidf_pipeline(args, raw_resume: str):
+
+     # Step 2: Clean resume text
+     cleaned_resume = clean_text(raw_resume)
+
+     # Step 3: Load vectorizer and job matrix
+     vectorizer = load_vectorizer(args.vectorizer_path)
+     job_matrix = load_vector_data(args.job_matrix_path)
+
+     # Step 4: Vectorize cleaned resume text
+     resume_vector = vectorizer.transform([cleaned_resume])
+
+     # Step 5: Compute similarity
+     sim_matrix = compute_similarity_matrix(resume_vector, job_matrix)
+
+     # Step 6: Load job titles
+     job_df = load_job_titles(args.job_title_csv)
+
+     # Step 7: Get top-N job matches
+     matches = top_n_tfidf_matches(sim_matrix, top_n=args.top_k, job_df=job_df)
+
+     print(f"\n🎯 Top {args.top_k} Job Matches for the Resume (TF-IDF):")
+     for job_idx, score in matches[0]:
+         print(f"🔹 {job_df.iloc[job_idx]['title']} (score: {score:0.4f})")
+
+     # Optional debug
+     if args.debug:
+         print("\n======= DEBUG MODE =======")
+         print("\n📄 Cleaned Resume Preview:\n", cleaned_resume[:1000])
+         print("\n--- Raw TF-IDF Similarity Scores (Top-K) ---")
+         for job_idx, score in matches[0]:
+             print(f"[{job_idx}] {job_df.iloc[job_idx]['title']} → {score:0.6f}")
+         print("============================")


-         # Step 2: Load vectorizer and job matrix
-         vectorizer = load_vectorizer(args.vectorizer_path)
-         job_matrix = load_vector_data(args.job_matrix_path)
+ def run_bert_pipeline(args, raw_resume: str):
+     # Step 2: Load SentenceTransformer model
+     model = SentenceTransformer(args.bert_model_path)

-         # Step 3: Vectorize cleaned resume text
-         resume_vector = vectorizer.transform([cleaned_resume]) # single row sparse matrix
+     # Step 3: Load FAISS job index
+     job_index = faiss.read_index(args.bert_faiss_index)

-         # Step 4: Compute similarity
-         sim_matrix = compute_similarity_matrix(resume_vector, job_matrix)
+     # Step 4: Encode resume into embedding
+     resume_embedding = model.encode([raw_resume], normalize_embeddings=True)

-         # Step 5: Load job titles for display
-         job_df = load_job_titles(args.job_title_csv)
+     # Step 5: Search deeply in FAISS index in order to eliminate duplicate job titles
+     # Search across all job embeddings in FAISS
+     n_jobs = job_index.ntotal
+     D, I = job_index.search(resume_embedding, n_jobs)

-         # Step 6: Get top-N job matches
-         matches = top_n_matches(sim_matrix, top_n=args.top_k, job_df=job_df)
+     # Step 6: Load job titles
+     job_df = load_job_titles(args.job_title_csv)

-         print(f"\n🎯 Top {args.top_k} Job Matches for the Resume:")
-         for job_idx, score in matches[0]: # 0 because it's the only resume
-             print(f"🔹 {job_df.iloc[job_idx]['title']} (score: {score:0.4f})")
+     print(f"\n🎯 Top {args.top_k} Job Matches for the Resume (BERT):")
+     matches = top_n_bert_matches(I, D, job_df, top_n=args.top_k)

-         # Optional debug output
-         if args.debug:
-             print("\n===== DEBUG MODE =====")
-             print("\n📄 Cleaned Resume Preview:\n", cleaned_resume[:1000])
-             print("\n📊 Raw Similarity Scores:\n", sim_matrix)
-             print("=======================")
+     for idx, score in matches:
+         print(f"🔹 {job_df.iloc[idx]['title']} (score: {score:0.4f})")
+
+     # Optional debug
+     if args.debug:
+         print("\n======= DEBUG MODE =======")
+         print("\n--- Raw BERT/FAISS Similarity Scores (Top-K) ---")
+         for idx, score in matches:
+             print(f"🔹 {job_df.iloc[idx]['title']} (score: {score})")
+         print("============================")
+
+
+ def main(args):
+     try:
+         # Step 1: Load raw resume text
+         if not os.path.exists(args.resume_path):
+             raise FileNotFoundError(f"Resume file not found: {args.resume_path}")
+         raw_resume = extract_text_from_file(args.resume_path)
+
+         # Run chosen pipeline
+         if args.model == "bert":
+             run_bert_pipeline(args, raw_resume)
+         else:
+             run_tfidf_pipeline(args, raw_resume)

      except Exception as e:
          print(f"❌ Error: {str(e)}")
@@ -57,11 +104,26 @@ def main(args):
  if __name__ == "__main__":
      parser = argparse.ArgumentParser(description="Match a resume to top relevant job titles")
      parser.add_argument('--resume_path', type=str, required=True, help="Path to resume file")
-     parser.add_argument('--vectorizer_path', type=str, default='models/app_tfidf/job_tfidf_vectorizer.pkl')
-     parser.add_argument('--job_matrix_path', type=str, default='models/app_tfidf/job_tfidf_matrix.npz')
+     parser.add_argument('--model', type=str, choices=['tfidf', 'bert'], default='tfidf',
+                         help="Which model pipeline to use: 'tfidf' or 'bert'")
+
+
+     # TF-IDF arguments
+     parser.add_argument('--vectorizer_path', type=str, default='models/tfidf/app_tfidf/job_tfidf_vectorizer.pkl')
+     parser.add_argument('--job_matrix_path', type=str, default='models/tfidf/app_tfidf/job_tfidf_matrix.npz')
+
+     # BERT arguments
+     parser.add_argument('--bert_model_path', type=str, default='models/bert/dapt_minilm_sentence_transformer',
+                         help="Path to fine-tuned SentenceTransformer model")
+     parser.add_argument('--bert_faiss_index', type=str, default='models/bert/app_bert/jobs_bert_embeddings.faiss',
+                         help="Path to FAISS index of job embeddings")
+
+     # Shared arguments
      parser.add_argument('--job_title_csv', type=str, default='data/app_data/job_titles.csv')
-     parser.add_argument('--top_k', type=int, default=5, help="Number of top job matches to return")
-     parser.add_argument('--debug', action='store_true', help="Print cleaned resume and raw matches")
+     parser.add_argument('--top_k', type=int, default=5,
+                         help="Number of top job matches to return")
+     parser.add_argument('--debug', action='store_true',
+                         help="Print raw similarity scores and cleaned resume for tfidf pipeline")

      args = parser.parse_args()
      main(args)
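
Note (not part of the commit): a minimal sketch of driving the new BERT path programmatically, assuming the pipelines package is importable from the project root and that the DAPT-converted model and jobs FAISS index already exist at the argparse defaults above; the resume path is a placeholder.

# Sketch only: exercise run_bert_pipeline() with the defaults wired above.
from argparse import Namespace
from pipelines.app_pipeline import run_bert_pipeline
from src.utils.file_reader import extract_text_from_file

args = Namespace(
    bert_model_path="models/bert/dapt_minilm_sentence_transformer",
    bert_faiss_index="models/bert/app_bert/jobs_bert_embeddings.faiss",
    job_title_csv="data/app_data/job_titles.csv",
    top_k=5,
    debug=False,
)
raw_resume = extract_text_from_file("my_resume.pdf")  # placeholder path
run_bert_pipeline(args, raw_resume)
# CLI equivalent: python pipelines/app_pipeline.py --resume_path my_resume.pdf --model bert
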
pipelines/recruiter_pipeline.py CHANGED
@@ -65,7 +65,7 @@ if __name__ == "__main__":
      parser = argparse.ArgumentParser(description="Recruiter Pipeline: Rank resumes for a given job description")
      parser.add_argument('--job_desc_path', type=str, required=True, help="Path to job description file")
      parser.add_argument('--resume_dir', type=str, required=True, help="Directory containing applicant resumes")
-     parser.add_argument('--vectorizer_path', type=str, default='models/recruiter_tfidf/combined_tfidf_vectorizer.pkl')
+     parser.add_argument('--vectorizer_path', type=str, default='models/tfidf/recruiter_tfidf/combined_tfidf_vectorizer.pkl')
      parser.add_argument('--top_k', type=int, default=10, help="Number of top resumes to return")
      parser.add_argument('--debug', action='store_true', help="Print cleaned job/resume text and raw matches")
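
Note (not part of the commit): a hedged example of invoking the recruiter pipeline with the relocated vectorizer default; the job-description file and resume directory are placeholders.

# Sketch only: the recruiter pipeline is unchanged apart from the default vectorizer path.
import subprocess

subprocess.run([
    "python", "pipelines/recruiter_pipeline.py",
    "--job_desc_path", "my_job_description.txt",   # placeholder
    "--resume_dir", "applicant_resumes/",          # placeholder
    "--vectorizer_path", "models/tfidf/recruiter_tfidf/combined_tfidf_vectorizer.pkl",
    "--top_k", "10",
], check=True)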
 
python ADDED
File without changes
src/feature_engg/bert_embedding_data.py ADDED
@@ -0,0 +1,137 @@
+ import os
+ import numpy as np
+ import pandas as pd
+ import torch
+ from typing import Optional
+ from sentence_transformers import SentenceTransformer, models
+ import faiss
+
+
+ def get_bert_model(model_name: str = "all-MiniLM-L6-v2",
+                    device: str = None):
+     """
+     Loads a BERT-based sentence transformer model for embeddings.
+
+     Args:
+         model_name (str): HuggingFace model name. Default is "all-MiniLM-L6-v2".
+         device (str, optional): "cuda", "cpu", or None (auto-detect).
+
+     Returns:
+         SentenceTransformer: Loaded model ready for encoding.
+     """
+     device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+     return SentenceTransformer(model_name, device=device)
+
+
+ def save_bert_embeddings(embeddings: np.ndarray,
+                          path: str):
+     """
+     Save dense BERT embeddings as a FAISS index file (.faiss).
+     """
+
+     if not path.endswith('.faiss'):
+         path += '.faiss'
+     os.makedirs(os.path.dirname(path), exist_ok=True)
+
+     embedding_dimension = embeddings.shape[1]
+     index = faiss.IndexFlatIP(embedding_dimension) # Inner Product (cosine if normalized)
+     index.add(embeddings)
+
+     faiss.write_index(index, path)
+     print(f"✅ BERT embeddings saved to FAISS index: [{path}] "
+           f"with {index.ntotal} vectors, dim={embedding_dimension}")
+
+
+ def save_bert_model(vectorizer: SentenceTransformer,
+                     path: str):
+     """Save the full SentenceTransformer model to disk."""
+
+     os.makedirs(path, exist_ok=True)
+     vectorizer.save(path)
+     print(f"✅ BERT model saved to: [{path}]")
+
+
+ def bert_embed_text(df: pd.DataFrame,
+                     text_column: str,
+                     label: str,
+                     model: Optional[SentenceTransformer] = None,
+                     save_path: Optional[str] = None,
+                     save_model_file: bool = False):
+     """
+     Encodes text from a DataFrame into dense BERT embeddings.
+
+     To save the embeddings and model, ensure 'save_path' is provided along with a valid 'label'.
+
+     Args:
+         df (pd.DataFrame): DataFrame containing the text to be encoded.
+         text_column (str): Column with text to be encoded.
+         label (str): Label prefix for saved files (e.g., 'resumes', 'jobs').
+         model (SentenceTransformer, optional): Preloaded model.
+         save_path (str, optional): Directory to save outputs.
+         save_model_file (bool): If True, also saves the model reference.
+
+     Returns:
+         tuple: (embeddings ndarray, model)
+     """
+
+     if df[text_column].isnull().any():
+         print(f"\n⚠️ Found missing values in column '{text_column}', replacing with empty string.")
+         df[text_column] = df[text_column].fillna("")
+
+     if model is None:
+         model = get_bert_model()
+
+     embeddings = model.encode(
+         df[text_column].tolist(),
+         convert_to_numpy=True,
+         show_progress_bar=True,
+         normalize_embeddings=True # normalizing as it is good for cosine similarity.
+     )
+
+     if save_path and label:
+         save_bert_embeddings(embeddings, os.path.join(save_path, f"{label}_bert_embeddings.faiss"))
+         if save_model_file:
+             save_bert_model(model, os.path.join(save_path, f"{label}_bert_model"))
+
+     return embeddings, model
+
+
+ def load_bert_embeddings(path: str):
+     """
+     Load a FAISS index file (.faiss) from disk.
+     """
+     if not path.endswith('.faiss'):
+         path += '.faiss'
+     return faiss.read_index(path)
+
+
+ def load_bert_model(path: str):
+     """Load a saved SentenceTransformer model."""
+
+     return SentenceTransformer(path)
+
+
+ def convert_hf_model_to_st(hf_model_path: str,
+                            st_model_path: str):
+     """
+     Converts a HuggingFace model to a SentenceTransformer model.
+
+     Needed as fine-tuning was performed using HuggingFace's Transformers library.
+
+     Args:
+         hf_model_path (str): Path to the HuggingFace model.
+         st_model_path (str): Path to save the SentenceTransformer model.
+
+     Returns:
+         None: Saves the SentenceTransformer model to the specified path.
+     """
+     # Build SentenceTransformer from HF model
+     word_embedding_model = models.Transformer(hf_model_path)
+     pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
+     st_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
+
+     # Save to the provided path
+     os.makedirs(st_model_path, exist_ok=True)
+     st_model.save(st_model_path)
+     print(f"✅ Converted HuggingFace model [{hf_model_path}] "
+           f"to SentenceTransformer at [{st_model_path}]")
src/feature_engg/{vectorizing_data.py → tfidf_vectorizing_data.py} RENAMED
@@ -29,16 +29,16 @@ def get_combined_tfidf_vectorizer(max_features: int = 40000,
      return TfidfVectorizer(
          stop_words="english",
          lowercase=True,
-         max_features=max_features, # Balanced for resumes + jobs
+         max_features=max_features, # Balanced for resumes + jobs
          ngram_range=ngram_range,
          min_df=5,
          max_df=0.85,
-         sublinear_tf=True, # Smooth term frequency scaling
+         sublinear_tf=True, # Smooth term frequency scaling
          norm="l2"
      )

  def save_vectorizer(vectorizer: TfidfVectorizer,
-                     path: str = 'models/dev_tfidf/tfidf_vectorizer.pkl'):
+                     path: str = 'models/tfidf/dev_tfidf/tfidf_vectorizer.pkl'):

      """
      Saves a TfidfVectorizer object to a given path. Appends .pkl if missing.
@@ -64,9 +64,9 @@ def save_vector_data(matrix: csr_matrix, path: str):
      print(f"✅ TF-IDF matrix saved to: [{path}]")


- def vectorize_text(df: pd.DataFrame,
+ def tfidf_vectorize_text(df: pd.DataFrame,
                           text_column: str,
-                          label: str, # e.g., 'resumes' or 'jobs'
+                          label: str,
                           vectorizer: Optional[TfidfVectorizer] = None,
                           fit_vectorizer: bool = False,
                           save_path: Optional[str] = None,
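
Note (not part of the commit): callers now import from src.feature_engg.tfidf_vectorizing_data. A small sketch of a fit-and-save call using only the parameters visible in this hunk; the input DataFrame, column name, and output directory are assumptions.

# Sketch only: fit a TF-IDF vectorizer on job text and save it under the new models/tfidf/ layout.
import pandas as pd
from src.feature_engg.tfidf_vectorizing_data import tfidf_vectorize_text

jobs_df = pd.read_csv("data/app_data/job_titles.csv")   # assumed input
tfidf_vectorize_text(jobs_df,
                     text_column="title",               # assumed column name
                     label="jobs",
                     vectorizer=None,
                     fit_vectorizer=True,
                     save_path="models/tfidf/app_tfidf")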
src/fine_tuning/domain_adaptive_bert.py ADDED
@@ -0,0 +1,194 @@
+ from pathlib import Path
+ from datasets import Dataset
+ import argparse
+ from sklearn.model_selection import train_test_split
+ from transformers import (AutoTokenizer,
+                           AutoModelForMaskedLM,
+                           DataCollatorForLanguageModeling,
+                           Trainer,
+                           TrainingArguments,
+                           EarlyStoppingCallback)
+
+
+
+ def run_dapt(corpus_path: str,
+              model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
+              output_dir: str = "models/bert/dapt_minilm",
+              num_train_epochs: int = 3,
+              per_device_train_batch_size: int = 32,
+              learning_rate: float = 5e-5,
+              warmup_steps: int = 0,
+              save_total_limit: int = 2,
+              logging_steps: int = 100,
+              max_seq_length: int = 256,
+              val_split: float = 0.1,
+              early_stopping_patience: int = 2,
+              early_stopping_threshold: float = 0.01,
+              save_best_only: bool = True):
+     """
+     Runs Domain-Adaptive Pretraining (DAPT) on a given text corpus.
+
+     Args:
+         corpus_path (str): Path to the text corpus file.
+         model_name (str): Name of the pre-trained BERT model to use. default: "sentence-transformers/all-MiniLM-L6-v2".
+         output_dir (str): Directory to save the trained model. default: "models/bert/dapt_minilm".
+         num_train_epochs (int): Number of training epochs. default: 3.
+         per_device_train_batch_size (int): Batch size for training. default: 32.
+         learning_rate (float): Learning rate for training. default: 5e-5.
+         warmup_steps (int): Number of warmup steps for training. default: 0.
+         save_total_limit (int): Number of checkpoints to save. default: 2.
+         logging_steps (int): Number of steps to log. default: 100.
+         max_seq_length (int): Maximum sequence length for input. default: 256.
+         val_split (float): Fraction of the data to use for validation. default: 0.1.
+         early_stopping_patience (int): Number of epochs to wait for improvement before early stopping. default: 2.
+         early_stopping_threshold (float): Threshold for early stopping improvement. default: 0.01.
+         save_best_only (bool): Whether to save only the best model. default: True.
+
+     Returns:
+         output_dir (str): Path to the trained model directory.
+     """
+
+     # Load dataset from text file bypassing any future caching errors.
+     with open(corpus_path, encoding="utf-8") as f:
+         lines = [l.strip() for l in f if l.strip()]
+
+     if val_split > 0:
+         # Train/validation split
+         train_texts, val_texts = train_test_split(lines, test_size=val_split, random_state=42)
+         dataset = Dataset.from_dict({"text": train_texts})
+         val_dataset = Dataset.from_dict({"text": val_texts})
+     else:
+         # Use full data for training
+         dataset = Dataset.from_dict({"text": lines})
+         val_dataset = None
+
+     # Tokenizer & model
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     model = AutoModelForMaskedLM.from_pretrained(model_name)
+
+     # Tokenization function
+     def tokenize_fn(batch):
+         return tokenizer(
+             batch["text"],
+             truncation=True,
+             padding="max_length",
+             max_length=max_seq_length,
+         )
+
+     tokenized_train = dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
+     tokenized_val = val_dataset.map(tokenize_fn, batched=True, remove_columns=["text"]) if val_dataset else None
+
+     # Data collator with dynamic masking
+     data_collator = DataCollatorForLanguageModeling(
+         tokenizer=tokenizer,
+         mlm=True,
+         mlm_probability=0.15,
+     )
+
+     # Base training arguments
+     training_args = {
+         "output_dir": output_dir,
+         "overwrite_output_dir": True,
+         "num_train_epochs": num_train_epochs,
+         "per_device_train_batch_size": per_device_train_batch_size,
+         "save_total_limit": save_total_limit,
+         "prediction_loss_only": True,
+         "logging_steps": logging_steps,
+         "learning_rate": learning_rate,
+         "warmup_steps": warmup_steps,
+         "save_strategy": "epoch",
+         "report_to": "none",
+     }
+
+     # Add validation-related args only if val_split > 0
+     if val_dataset:
+         training_args.update({
+             "eval_strategy": "epoch",
+             "load_best_model_at_end": save_best_only,
+             "metric_for_best_model": "eval_loss",
+             "greater_is_better": False,
+         })
+
+
+     training_args = TrainingArguments(**training_args)
+
+     # Callbacks
+     callbacks = []
+     if val_dataset and early_stopping_patience > 0:
+         callbacks.append(
+             EarlyStoppingCallback(
+                 early_stopping_patience=early_stopping_patience,
+                 early_stopping_threshold=early_stopping_threshold,
+             )
+         )
+
+     # Trainer
+     trainer = Trainer(
+         model=model,
+         args=training_args,
+         train_dataset=tokenized_train,
+         eval_dataset=tokenized_val if val_dataset else None,
+         tokenizer=tokenizer,
+         data_collator=data_collator,
+         callbacks=callbacks if val_dataset else None,
+     )
+
+     # Train
+     print("🚀 Starting Domain-Adaptive Pretraining (DAPT)...")
+     trainer.train()
+     trainer.save_model(output_dir)
+     tokenizer.save_pretrained(output_dir)
+
+     print(f"✅ DAPT finished! Model saved at: {output_dir}")
+     return output_dir
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="Domain-Adaptive Pretraining (DAPT) for BERT/SBERT")
+
+     parser.add_argument("--model_name", type=str, default="sentence-transformers/all-MiniLM-L6-v2",
+                         help="Pretrained model name or path to load")
+     parser.add_argument("--corpus_path", type=str, default="data/processed/domain_corpus.txt",
+                         help="Path to plain text corpus for DAPT")
+     parser.add_argument("--output_dir", type=str, default="models/dapt_bert",
+                         help="Directory to save the fine-tuned model")
+     parser.add_argument("--epochs", type=int, default=3,
+                         help="Number of training epochs")
+     parser.add_argument("--batch_size", type=int, default=32,
+                         help="Training batch size per device")
+     parser.add_argument("--learning_rate", type=float, default=5e-5,
+                         help="Learning rate for AdamW optimizer")
+     parser.add_argument("--warmup_steps", type=int, default=0,
+                         help="Number of warmup steps for LR scheduler")
+     parser.add_argument("--max_seq_length", type=int, default=256,
+                         help="Maximum sequence length for inputs")
+     parser.add_argument("--val_split", type=float, default=0.1,
+                         help="Fraction of data to use for validation (set 0 for no validation)")
+     parser.add_argument("--early_stopping_patience", type=int, default=2,
+                         help="Number of evals with no improvement before stopping (ignored if val_split=0)")
+     parser.add_argument("--early_stopping_threshold", type=float, default=0.01,
+                         help="Minimum improvement in eval loss to be considered progress (ignored if val_split=0)")
+     parser.add_argument("--save_best_only", action="store_true",
+                         help="Save only the best checkpoint (ignored if val_split=0)")
+
+     args = parser.parse_args()
+     Path(args.output_dir).mkdir(parents=True, exist_ok=True)
+
+     run_dapt(
+         model_name=args.model_name,
+         corpus_path=args.corpus_path,
+         output_dir=args.output_dir,
+         num_train_epochs=args.epochs,
+         per_device_train_batch_size=args.batch_size,
+         learning_rate=args.learning_rate,
+         warmup_steps=args.warmup_steps,
+         max_seq_length=args.max_seq_length,
+         val_split=args.val_split,
+         early_stopping_patience=args.early_stopping_patience,
+         early_stopping_threshold=args.early_stopping_threshold,
+         save_best_only=args.save_best_only,
+     )
+
+
+ if __name__ == "__main__":
+     main()
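
Note (not part of the commit): one way this module might be wired into the BERT app path is to run DAPT on the domain corpus and then convert the resulting MLM checkpoint with convert_hf_model_to_st, so the app pipeline's --bert_model_path default points at a SentenceTransformer directory. A sketch using the defaults shown above:

# Sketch only: DAPT then convert, reusing the defaults from run_dapt and the app pipeline.
from src.fine_tuning.domain_adaptive_bert import run_dapt
from src.feature_engg.bert_embedding_data import convert_hf_model_to_st

dapt_dir = run_dapt(
    corpus_path="data/processed/domain_corpus.txt",      # default corpus location
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    output_dir="models/bert/dapt_minilm",
    num_train_epochs=3,
    val_split=0.1,
)
convert_hf_model_to_st(dapt_dir, "models/bert/dapt_minilm_sentence_transformer")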
src/matching/matching_engine.py CHANGED
@@ -1,6 +1,6 @@
  import numpy as np
  from sklearn.metrics.pairwise import cosine_similarity
- from src.feature_engg.vectorizing_data import load_vector_data
+ from src.feature_engg.tfidf_vectorizing_data import load_vector_data

  def compute_similarity_matrix(X_resumes, X_jobs ):
      """
@@ -9,7 +9,7 @@ def compute_similarity_matrix(X_resumes, X_jobs ):
      """
      return cosine_similarity(X_resumes, X_jobs)

- def top_n_matches(similarity_matrix: np.ndarray,
+ def top_n_tfidf_matches(similarity_matrix: np.ndarray,
                          top_n: int = 5,
                          job_df = None):
      """
@@ -36,6 +36,34 @@ def top_n_matches(similarity_matrix: np.ndarray,
          results[i] = ranked
      return results

+ def top_n_bert_matches(indices, distances, job_df, top_n=5):
+     """
+     Deduplicate FAISS results by job title and return top-N unique matches.
+     Searches across all jobs if provided.
+
+     Args:
+         indices (np.ndarray): Indices of nearest neighbors from FAISS (shape: [1, k]).
+         distances (np.ndarray): Distances/similarities from FAISS (shape: [1, k]).
+         job_df (pd.DataFrame): DataFrame containing job titles.
+         top_n (int): Number of unique top matches to return.
+
+     Returns:
+         List[Tuple[int, float]]: List of (job_idx, score) for top-N unique titles.
+     """
+     seen_titles = set()
+     ranked = []
+
+     for idx, score in zip(indices[0], distances[0]):
+         title = job_df.iloc[idx]['title']
+         if title not in seen_titles:
+             ranked.append((idx, float(score)))
+             seen_titles.add(title)
+             if len(ranked) == top_n:
+                 break
+
+     return ranked
+
+
  if __name__ == "__main__":
      # Define paths
      resume_vec_path = "models/dev_tfidf/resumes_tfidf_matrix.npz"
@@ -56,7 +84,7 @@ if __name__ == "__main__":
      # print(f"Min score: {np.min(all_scores):0.4f}, \nMax score: {np.max(all_scores):0.4f}, \nMean score: {np.mean(all_scores):0.4f}, \nMedian score: {np.median(all_scores):0.4f}")

      # Get top 5 matches per resume
-     matches = top_n_matches(similarity_matrix, top_n=5)
+     matches = top_n_tfidf_matches(similarity_matrix, top_n=5)

      # Display example output (i.e. top_n job matches for first 5 resumes)
      for resume_idx, top_jobs in list(matches.items())[:5]:
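
Note (not part of the commit): a self-contained toy example illustrating the de-duplication behaviour of top_n_bert_matches; the embeddings are random and the titles hypothetical.

# Sketch only: duplicate titles returned by FAISS are collapsed to one entry each.
import faiss
import numpy as np
import pandas as pd
from src.matching.matching_engine import top_n_bert_matches

job_df = pd.DataFrame({"title": ["Data Scientist", "Data Scientist", "ML Engineer", "Analyst"]})
embs = np.random.rand(4, 8).astype("float32")
embs /= np.linalg.norm(embs, axis=1, keepdims=True)   # normalize so inner product ~ cosine

index = faiss.IndexFlatIP(8)
index.add(embs)

query = embs[:1]                                      # stand-in for the resume embedding
D, I = index.search(query, index.ntotal)              # search all jobs, as the app pipeline does
print(top_n_bert_matches(I, D, job_df, top_n=2))      # the duplicate "Data Scientist" row is skipped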