Om-Shandilya committed
Commit f6d8deb · 1 Parent(s): 65281cf

Pipelines with local model use

pipelines/app_pipeline.py CHANGED
@@ -43,12 +43,12 @@ def run_tfidf_pipeline(args, raw_resume: str):
 
     # Optional debug
     if args.debug:
-        print("\n======= DEBUG MODE =======")
-        print("\n📄 Cleaned Resume Preview:\n", cleaned_resume[:1000])
-        print("\n--- Raw TF-IDF Similarity Scores (Top-K) ---")
+        print("\n================ DEBUG MODE ================")
+        print("\n📄--- [DEBUG - TFIDF] Cleaned Resume Preview:\n", cleaned_resume[:1000], "---")
+        print(f"\n--- [DEBUG - TFIDF] Raw Similarity Scores (top {args.top_k}) ---")
         for job_idx, score in matches[0]:
             print(f"[{job_idx}] {job_df.iloc[job_idx]['title']} → {score:0.6f}")
-        print("============================")
+        print("==============================================")
 
 
 def run_bert_pipeline(args, raw_resume: str):
@@ -77,11 +77,11 @@ def run_bert_pipeline(args, raw_resume: str):
 
     # Optional debug
     if args.debug:
-        print("\n======= DEBUG MODE =======")
-        print("\n--- Raw BERT/FAISS Similarity Scores (Top-K) ---")
+        print("\n================ DEBUG MODE ================")
+        print(f"\n--- [DEBUG - BERT/FAISS] Raw Similarity Scores (top {args.top_k}) ---")
         for idx, score in matches:
             print(f"🔹 {job_df.iloc[idx]['title']} (score: {score})")
-        print("============================")
+        print("==============================================")
 
 
 def main(args):
pipelines/dev_pipeline.py CHANGED
@@ -37,9 +37,9 @@ if __name__ == "__main__":
 
 
     # Step 4: Vectorize using shared TF-IDF vectorizer
-    from src.feature_engg.vectorizing_data import (
-        get_tfidf_vectorizer, vectorize_text, save_vectorizer, save_vector_data
-    )
+    from src.feature_engg.tfidf_vectorizing_data import (get_tfidf_vectorizer,
+                                                         tfidf_vectorize_text,
+                                                         save_vectorizer)
 
     print("\n💻 Vectorizing text using shared TF-IDF vectorizer...")
 
@@ -54,13 +54,13 @@ vector_save_dir = "models/dev_tfidf"
     os.makedirs(vector_save_dir, exist_ok=True)
 
     # Transform resumes and jobs separately using the same vectorizer
-    X_resumes, _ = vectorize_text(
+    X_resumes, _ = tfidf_vectorize_text(
         df_resumes, text_column="text_cleaned", label="resumes",
         vectorizer=shared_vectorizer, fit_vectorizer=False,
         save_path=vector_save_dir, save_vectorizer_file=False  # We'll save manually below
     )
 
-    X_jobs, _ = vectorize_text(
+    X_jobs, _ = tfidf_vectorize_text(
         df_jobs, text_column="text_cleaned", label="jobs",
         vectorizer=shared_vectorizer, fit_vectorizer=False,
         save_path=vector_save_dir, save_vectorizer_file=False
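
The renamed helpers keep a single vectorizer shared between resumes and jobs so both land in the same feature space. A rough sketch of that fit-once, transform-twice pattern with plain scikit-learn (the get_tfidf_vectorizer / tfidf_vectorize_text wrappers and their exact signatures are the project's own and are only assumed here):

    from sklearn.feature_extraction.text import TfidfVectorizer

    resumes = ["python developer with sklearn experience", "data analyst, sql and excel"]
    jobs = ["looking for a python developer", "hiring a sql data analyst"]

    # Fit once on the combined corpus so both sides share one vocabulary...
    shared_vectorizer = TfidfVectorizer()
    shared_vectorizer.fit(resumes + jobs)

    # ...then transform each side separately (what fit_vectorizer=False does in the pipeline).
    X_resumes = shared_vectorizer.transform(resumes)
    X_jobs = shared_vectorizer.transform(jobs)

    print(X_resumes.shape, X_jobs.shape)  # same number of columns on both sides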
pipelines/recruiter_pipeline.py CHANGED
@@ -2,72 +2,138 @@ import argparse
 import os
 import joblib
 from sklearn.metrics.pairwise import cosine_similarity
-from src.utils.bulk_loading import bulk_load_cleaned_resume_files
+import faiss
+from sentence_transformers import SentenceTransformer
+from src.utils.bulk_loading import bulk_load_raw_resume_files
 from src.utils.file_reader import extract_text_from_file
 from src.processing.text_cleaning import clean_text
 
 
-def main(args):
-    try:
-        # Step 1: Load vectorizer
-        if not os.path.exists(args.vectorizer_path):
-            raise FileNotFoundError(f"⚠️ Vectorizer file not found: {args.vectorizer_path}")
-        vectorizer = joblib.load(args.vectorizer_path)
-
-        # Step 2: Process job description
-        if not os.path.exists(args.job_desc_path):
-            raise FileNotFoundError(f"⚠️ Job description file not found: {args.job_desc_path}")
-
-        raw_job_text = extract_text_from_file(args.job_desc_path)
-        cleaned_job_text = clean_text(raw_job_text)
-        job_vector = vectorizer.transform([cleaned_job_text])
-
-        # Step 3: Process applicant resumes
-        if not os.path.isdir(args.resume_dir):
-            raise NotADirectoryError(f"⚠️ Resume directory not found: {args.resume_dir}")
-
-        resume_texts = bulk_load_cleaned_resume_files(args.resume_dir)  # dict: {filename: cleaned_text}
-
-        if not resume_texts:
-            raise ValueError("⚠️ No valid resumes found in the given directory.")
-
-        resume_matrix = vectorizer.transform(resume_texts.values())
-
-        # Step 4: Compute similarity
-        sims = cosine_similarity(job_vector, resume_matrix)[0]
-
-        # Step 5: Rank resumes
-        ranked = sorted(zip(resume_texts.keys(), sims), key=lambda x: x[1], reverse=True)
-        top_k = min(args.top_k, len(ranked))
-
-        # Step 6: Print the output
-        print(f"\n🎯 Top {top_k} Candidate Matches for the Job:")
-        for i, (fname, score) in enumerate(ranked[:top_k], 1):
-            print(f"{i}. {fname} → score: {score:.4f}")
-
-        # Optional debug output
-        if args.debug:
-            print("\n===== DEBUG MODE =====")
-            print("\n📄 Cleaned Job Description Preview:\n", cleaned_job_text[:1000])
-            print("\n📊 Raw Similarity Scores:\n", sims)
-            print("=======================")
+def rank_with_tfidf(args, raw_job_text, raw_resume_texts):
+    """TF-IDF recruiter pipeline"""
+    # Step 1: Load vectorizer
+    if not os.path.exists(args.vectorizer_path):
+        raise FileNotFoundError(f"⚠️ Vectorizer file not found: {args.vectorizer_path}")
+    vectorizer = joblib.load(args.vectorizer_path)
 
+    # Step 2: Process job description
+    cleaned_job_text = clean_text(raw_job_text)
+    job_vector = vectorizer.transform([cleaned_job_text])
+
+    # Step 3: Process resumes
+    cleaned_resumes = {fname: clean_text(txt) for fname, txt in raw_resume_texts.items()}
+    resume_matrix = vectorizer.transform(cleaned_resumes.values())
+
+    # Step 4: Compute similarity
+    sims = cosine_similarity(job_vector, resume_matrix)[0]
+
+    if args.debug:
+        print("\n================ DEBUG MODE ================")
+        print("\n[DEBUG - TFIDF] Cleaned job description:")
+        print(cleaned_job_text[:500], "...\n")
+        print("[DEBUG - TFIDF] First 3 cleaned resumes:")
+        for i, (fname, txt) in enumerate(cleaned_resumes.items()):
+            if i >= 3: break
+            print(f"{fname}: {txt[:300]}...\n")
+        print("[DEBUG - TFIDF] Raw similarity scores:", sims[:10])
+        print("==============================================")
+
+    # Step 5: Rank resumes
+    ranked = sorted(zip(cleaned_resumes.keys(), sims), key=lambda x: x[1], reverse=True)
+    return ranked
+
+
+def rank_with_bert(args, raw_job_text, raw_resume_texts):
+    """BERT recruiter pipeline using FAISS (on the fly)"""
+    if not os.path.exists(args.bert_model_path):
+        raise FileNotFoundError(f"⚠️ BERT model not found: {args.bert_model_path}")
+
+    # Step 1: Load BERT model
+    model = SentenceTransformer(args.bert_model_path)
+
+    # Step 2: Encode job description
+    job_embedding = model.encode([raw_job_text], convert_to_numpy=True, normalize_embeddings=True)
+
+    # Step 3: Encode resumes
+    resume_embeddings = model.encode(list(raw_resume_texts.values()), convert_to_numpy=True, normalize_embeddings=True)
+
+    # Step 4: Create FAISS indices
+    local_index = faiss.IndexFlatIP(resume_embeddings.shape[1])
+    local_index.add(resume_embeddings)
+
+    scores, indices = local_index.search(job_embedding, len(raw_resume_texts))
+
+    if args.debug:
+        print("\n================ DEBUG MODE ================")
+        print("\n[DEBUG - BERT/FAISS] Raw job description:")
+        print(raw_job_text[:500], "...\n")
+        print("[DEBUG - BERT/FAISS] First 3 raw resumes:")
+        for i, (fname, txt) in enumerate(raw_resume_texts.items()):
+            if i >= 3: break
+            print(f"{fname}: {txt[:300]}...\n")
+        print(f"[DEBUG - BERT/FAISS] all similarity scores:", scores[0][:len(raw_resume_texts)])
+        print("==============================================")
+
+    # Step 5: Rank resumes
+    ranked = [(list(raw_resume_texts.keys())[i], float(scores[0][j]))
+              for j, i in enumerate(indices[0])]
+    return ranked
+
+
+def main(args):
+    try:
+        # Load raw job and resumes
+        raw_job_text = extract_text_from_file(args.job_desc_path)
+        raw_resume_texts = bulk_load_raw_resume_files(args.resume_dir)
+
+        if not raw_resume_texts:
+            raise ValueError("⚠️ No valid resumes found in the given directory.")
+
+        # Limit the number of resumes displayed based on the top_k argument and available resumes
+        available_resumes = len(raw_resume_texts)
+        top_k = min(args.top_k, available_resumes)
+
+        if args.top_k > available_resumes:
+            print(f"\n⚠️ Only {available_resumes} resumes are available. "
+                  f"Showing top {available_resumes} matches instead of {args.top_k}.\n")
+
+        # Choose model
+        if args.model == "tfidf":
+            ranked = rank_with_tfidf(args, raw_job_text, raw_resume_texts)
+        elif args.model == "bert":
+            ranked = rank_with_bert(args, raw_job_text, raw_resume_texts)
+        else:
+            raise ValueError("❌ Invalid model. Choose 'tfidf' or 'bert'.")
+
+        # Display ranked resumes
+        print(f"\n🎯 Top {top_k} Candidate Matches for the Job ({args.model.upper()}):")
+        for i, (fname, score) in enumerate(ranked[:top_k], 1):
+            print(f"{i}. {fname} → score: {score:.4f}")
+
     except Exception as e:
         print(f"❌ Error: {str(e)}")
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Recruiter Pipeline: Rank resumes for a given job description")
+    parser = argparse.ArgumentParser(description="Recruiter Pipeline: Rank user uploaded resumes for a given job description")
+
+    # Shared arguments
     parser.add_argument('--job_desc_path', type=str, required=True, help="Path to job description file")
     parser.add_argument('--resume_dir', type=str, required=True, help="Directory containing applicant resumes")
-    parser.add_argument('--vectorizer_path', type=str, default='models/tfidf/recruiter_tfidf/combined_tfidf_vectorizer.pkl')
+    parser.add_argument('--model', type=str, choices=['tfidf', 'bert'], default='tfidf',
+                        help="Model to use: tfidf or bert")
     parser.add_argument('--top_k', type=int, default=10, help="Number of top resumes to return")
-    parser.add_argument('--debug', action='store_true', help="Print cleaned job/resume text and raw matches")
+    parser.add_argument('--debug', action='store_true', help="Print cleaned/raw texts and raw similarity scores")
+
+    # TF-IDF specific
+    parser.add_argument('--vectorizer_path', type=str,
+                        default='models/tfidf/recruiter_tfidf/combined_tfidf_vectorizer.pkl',
+                        help="Path to pre-trained TF-IDF vectorizer")
+
+    # BERT specific
+    parser.add_argument('--bert_model_path', type=str,
+                        default='models/bert/dapt_minilm_sentence_transformer',
+                        help="Path to fine-tuned BERT/SBERT model")
 
     args = parser.parse_args()
     main(args)
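
Since the embeddings are encoded with normalize_embeddings=True, the IndexFlatIP inner-product search above ranks resumes by cosine similarity. A minimal sketch of that equivalence, using random vectors in place of the project's SBERT model and resumes:

    import numpy as np
    import faiss
    from sklearn.metrics.pairwise import cosine_similarity

    rng = np.random.default_rng(0)
    resumes = rng.normal(size=(5, 384)).astype("float32")  # stand-in resume embeddings
    job = rng.normal(size=(1, 384)).astype("float32")      # stand-in job embedding

    # L2-normalise, mirroring model.encode(..., normalize_embeddings=True)
    faiss.normalize_L2(resumes)
    faiss.normalize_L2(job)

    index = faiss.IndexFlatIP(resumes.shape[1])  # inner product on unit vectors == cosine
    index.add(resumes)
    scores, indices = index.search(job, len(resumes))

    print(scores[0])                                        # FAISS inner-product scores
    print(cosine_similarity(job, resumes)[0][indices[0]])   # same values, same order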
src/feature_engg/tfidf_vectorizing_data.py CHANGED
@@ -27,14 +27,14 @@ def get_combined_tfidf_vectorizer(max_features: int = 40000,
     Creates a TF-IDF vectorizer with specified parameters for larger vocab with both Jobs and Resume.
     """
     return TfidfVectorizer(
-        stop_words="english",
-        lowercase=True,
+        stop_words="english",      # Remove common English stopwords
+        lowercase=True,            # Convert all to lowercase
         max_features=max_features, # Balanced for resumes + jobs
-        ngram_range=ngram_range,
-        min_df=5,
-        max_df=0.85,
+        ngram_range=ngram_range,   # By default Unigrams + Bigrams
+        min_df=5,                  # Ignore very rare words
+        max_df=0.85,               # Ignore very common words
         sublinear_tf=True,         # Smooth term frequency scaling
-        norm="l2"
+        norm="l2"                  # Normalize for cosine similarity
     )
 
 def save_vectorizer(vectorizer: TfidfVectorizer,
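
Because the vectorizer keeps norm="l2", cosine similarity between two TF-IDF rows reduces to their dot product, which is what the app and recruiter pipelines rely on when comparing job and resume vectors. A small check of that property on a toy corpus (the min_df/max_df cut-offs are dropped so the tiny vocabulary isn't filtered away):

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

    docs = ["machine learning engineer resume",
            "job posting for a machine learning engineer",
            "accountant resume with excel skills"]

    # Same flags as get_combined_tfidf_vectorizer, minus the frequency cut-offs
    vec = TfidfVectorizer(stop_words="english", lowercase=True,
                          ngram_range=(1, 2), sublinear_tf=True, norm="l2")
    X = vec.fit_transform(docs)

    # With l2-normalised rows, the plain dot product equals cosine similarity
    print(cosine_similarity(X[0], X[1]))
    print(linear_kernel(X[0], X[1]))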
src/utils/bulk_loading.py CHANGED
@@ -1,21 +1,20 @@
 import os
-from typing import List, Dict, Union
-
-from src.processing.text_cleaning import clean_text  # assuming you already have this
+from typing import List, Union
 from src.utils.file_reader import extract_text_from_pdf, extract_text_from_docx, extract_text_from_txt
 
-def bulk_load_cleaned_resume_files(input_path: Union[str, List[str]]) -> Dict[str, str]:
+def bulk_load_raw_resume_files(input_path: Union[str, List[str]]):
     """
-    Load multiple resumes from a directory or list of files.
+    Load multiple resumes from a directory, a list of files, or a single file.
 
     Args:
         input_path : str or List[str]
             Either:
             - A path to a directory containing resume files, OR
-            - A list of individual file paths.
+            - A list of individual file paths, OR
+            - A single file path.
 
     Returns:
-        Dict[str, str]: Dictionary mapping file's basenames -> cleaned text.
+        Dict[str, str]: Dictionary mapping file's basenames -> raw text.
     """
     resumes = {}
 
@@ -26,9 +25,11 @@ def bulk_load_cleaned_resume_files(input_path: Union[str, List[str]]) -> Dict[str, str]:
             for f in os.listdir(input_path)
             if f.lower().endswith((".pdf", ".docx", ".txt"))
         ]
+
     # Case 2: list of files
     elif isinstance(input_path, list):
         file_paths = input_path
+
     # Case 3: single file
     elif isinstance(input_path, str) and os.path.isfile(input_path):
         file_paths = [input_path]
@@ -49,13 +50,12 @@ def bulk_load_cleaned_resume_files(input_path: Union[str, List[str]]) -> Dict[str, str]:
             else:
                 print(f"⚠️ Skipping unsupported file type: {path}")
                 continue
-
-            cleaned = clean_text(text)
-            resumes[os.path.basename(path)] = cleaned
+
+            # ✅ Add to dictionary
+            resumes[os.path.basename(path)] = text
 
         except Exception as e:
             print(f"❌ Error processing {path}: {e}")
 
     print(f"✅ Loaded {len(resumes)} resumes.")
-
     return resumes
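
Going by the docstring above, the loader accepts a directory, a list of paths, or a single file and returns {basename: raw_text}. A hypothetical call (paths are placeholders):

    from src.utils.bulk_loading import bulk_load_raw_resume_files

    # Directory of .pdf / .docx / .txt resumes (placeholder path)
    resumes = bulk_load_raw_resume_files("data/applicant_resumes")

    # A list of files or a single file path is also accepted per the docstring:
    # resumes = bulk_load_raw_resume_files(["cv_a.pdf", "cv_b.docx"])

    for fname, text in resumes.items():
        print(fname, "->", len(text), "characters of raw text")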