Commit f6d8deb
Parent(s): 65281cf

Pipelines with local model use

Files changed:
- pipelines/app_pipeline.py +7 -7
- pipelines/dev_pipeline.py +5 -5
- pipelines/recruiter_pipeline.py +107 -41
- src/feature_engg/tfidf_vectorizing_data.py +6 -6
- src/utils/bulk_loading.py +11 -11
pipelines/app_pipeline.py CHANGED

@@ -43,12 +43,12 @@ def run_tfidf_pipeline(args, raw_resume: str):

    # Optional debug
    if args.debug:
-        print("\n
-        print("\n
-        print("\n---
+        print("\n================ DEBUG MODE ================")
+        print("\n📄--- [DEBUG - TFIDF] Cleaned Resume Preview:\n", cleaned_resume[:1000], "---")
+        print(f"\n--- [DEBUG - TFIDF] Raw Similarity Scores (top {args.top_k}) ---")
         for job_idx, score in matches[0]:
             print(f"[{job_idx}] {job_df.iloc[job_idx]['title']} → {score:0.6f}")
-        print("
+        print("==============================================")


 def run_bert_pipeline(args, raw_resume: str):

@@ -77,11 +77,11 @@ def run_bert_pipeline(args, raw_resume: str):

    # Optional debug
    if args.debug:
-        print("\n
-        print("\n---
+        print("\n================ DEBUG MODE ================")
+        print(f"\n--- [DEBUG - BERT/FAISS] Raw Similarity Scores (top {args.top_k}) ---")
         for idx, score in matches:
             print(f"🔹 {job_df.iloc[idx]['title']} (score: {score})")
-        print("
+        print("==============================================")


 def main(args):
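The debug block above iterates over matches[0] as (job_idx, score) pairs. Below is a minimal sketch of how such pairs can be produced with scikit-learn's TfidfVectorizer and cosine_similarity; the corpus, variable names, and structure are illustrative assumptions, not the repo's actual run_tfidf_pipeline internals.

# Illustrative sketch only: builds a matches[0] list of (job_idx, score) pairs.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

jobs = ["python developer with sklearn experience", "java backend engineer"]
resume = "data scientist, python, scikit-learn, tf-idf pipelines"

vectorizer = TfidfVectorizer()
job_matrix = vectorizer.fit_transform(jobs)        # one row per job posting
resume_vector = vectorizer.transform([resume])     # single cleaned resume

sims = cosine_similarity(resume_vector, job_matrix)[0]
top_k = 2
# matches[0]: (job index, similarity score) pairs, best match first
matches = [sorted(enumerate(sims), key=lambda x: x[1], reverse=True)[:top_k]]

for job_idx, score in matches[0]:
    print(f"[{job_idx}] score {score:0.6f}")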
pipelines/dev_pipeline.py CHANGED

@@ -37,9 +37,9 @@ if __name__ == "__main__":


    # Step 4: Vectorize using shared TF-IDF vectorizer
-    from src.feature_engg.
-
-    )
+    from src.feature_engg.tfidf_vectorizing_data import (get_tfidf_vectorizer,
+                                                         tfidf_vectorize_text,
+                                                         save_vectorizer)

     print("\n💻 Vectorizing text using shared TF-IDF vectorizer...")


@@ -54,13 +54,13 @@ vector_save_dir = "models/dev_tfidf"
     os.makedirs(vector_save_dir, exist_ok=True)

    # Transform resumes and jobs separately using the same vectorizer
-    X_resumes, _ =
+    X_resumes, _ = tfidf_vectorize_text(
         df_resumes, text_column="text_cleaned", label="resumes",
         vectorizer=shared_vectorizer, fit_vectorizer=False,
         save_path=vector_save_dir, save_vectorizer_file=False  # We'll save manually below
     )

-    X_jobs, _ =
+    X_jobs, _ = tfidf_vectorize_text(
         df_jobs, text_column="text_cleaned", label="jobs",
         vectorizer=shared_vectorizer, fit_vectorizer=False,
         save_path=vector_save_dir, save_vectorizer_file=False
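The point of this change is that both DataFrames are transformed with one already-fitted vectorizer (fit_vectorizer=False), so resumes and jobs land in the same vocabulary and IDF space and their cosine scores are comparable. A minimal sketch of that idea using scikit-learn directly, with a made-up corpus and without the repo's tfidf_vectorize_text wrapper:

# Sketch: fit once on the combined corpus, then transform each side separately.
from sklearn.feature_extraction.text import TfidfVectorizer

resume_texts = ["python pandas sklearn", "java spring microservices"]
job_texts = ["looking for a python data engineer", "senior java developer"]

shared_vectorizer = TfidfVectorizer()
shared_vectorizer.fit(resume_texts + job_texts)         # fit once

X_resumes = shared_vectorizer.transform(resume_texts)   # transform only
X_jobs = shared_vectorizer.transform(job_texts)         # same vocabulary and IDF weights

assert X_resumes.shape[1] == X_jobs.shape[1]            # identical feature space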
pipelines/recruiter_pipeline.py CHANGED

@@ -2,72 +2,138 @@ import argparse
 import os
 import joblib
 from sklearn.metrics.pairwise import cosine_similarity
-
+import faiss
+from sentence_transformers import SentenceTransformer
+from src.utils.bulk_loading import bulk_load_raw_resume_files
 from src.utils.file_reader import extract_text_from_file
 from src.processing.text_cleaning import clean_text


-def
-        raw_job_text = extract_text_from_file(args.job_desc_path)
-        cleaned_job_text = clean_text(raw_job_text)
-        job_vector = vectorizer.transform([cleaned_job_text])
-        top_k = min(args.top_k, len(ranked))
-            print(f"{i}. {fname} → score: {score:.4f}")
-        if args.debug:
-            print("\n===== DEBUG MODE =====")
-            print("\n📄 Cleaned Job Description Preview:\n", cleaned_job_text[:1000])
-            print("\n📄 Raw Similarity Scores:\n", sims)
-            print("=======================")
+def rank_with_tfidf(args, raw_job_text, raw_resume_texts):
+    """TF-IDF recruiter pipeline"""
+    # Step 1: Load vectorizer
+    if not os.path.exists(args.vectorizer_path):
+        raise FileNotFoundError(f"⚠️ Vectorizer file not found: {args.vectorizer_path}")
+    vectorizer = joblib.load(args.vectorizer_path)
+
+    # Step 2: Process job description
+    cleaned_job_text = clean_text(raw_job_text)
+    job_vector = vectorizer.transform([cleaned_job_text])
+
+    # Step 3: Process resumes
+    cleaned_resumes = {fname: clean_text(txt) for fname, txt in raw_resume_texts.items()}
+    resume_matrix = vectorizer.transform(cleaned_resumes.values())
+
+    # Step 4: Compute similarity
+    sims = cosine_similarity(job_vector, resume_matrix)[0]
+
+    if args.debug:
+        print("\n================ DEBUG MODE ================")
+        print("\n[DEBUG - TFIDF] Cleaned job description:")
+        print(cleaned_job_text[:500], "...\n")
+        print("[DEBUG - TFIDF] First 3 cleaned resumes:")
+        for i, (fname, txt) in enumerate(cleaned_resumes.items()):
+            if i >= 3: break
+            print(f"{fname}: {txt[:300]}...\n")
+        print("[DEBUG - TFIDF] Raw similarity scores:", sims[:10])
+        print("==============================================")
+
+    # Step 5: Rank resumes
+    ranked = sorted(zip(cleaned_resumes.keys(), sims), key=lambda x: x[1], reverse=True)
+    return ranked
+
+
+def rank_with_bert(args, raw_job_text, raw_resume_texts):
+    """BERT recruiter pipeline using FAISS (on the fly)"""
+    if not os.path.exists(args.bert_model_path):
+        raise FileNotFoundError(f"⚠️ BERT model not found: {args.bert_model_path}")
+
+    # Step 1: Load BERT model
+    model = SentenceTransformer(args.bert_model_path)
+
+    # Step 2: Encode job description
+    job_embedding = model.encode([raw_job_text], convert_to_numpy=True, normalize_embeddings=True)
+
+    # Step 3: Encode resumes
+    resume_embeddings = model.encode(list(raw_resume_texts.values()), convert_to_numpy=True, normalize_embeddings=True)
+
+    # Step 4: Create FAISS indices
+    local_index = faiss.IndexFlatIP(resume_embeddings.shape[1])
+    local_index.add(resume_embeddings)
+
+    scores, indices = local_index.search(job_embedding, len(raw_resume_texts))
+
+    if args.debug:
+        print("\n================ DEBUG MODE ================")
+        print("\n[DEBUG - BERT/FAISS] Raw job description:")
+        print(raw_job_text[:500], "...\n")
+        print("[DEBUG - BERT/FAISS] First 3 raw resumes:")
+        for i, (fname, txt) in enumerate(raw_resume_texts.items()):
+            if i >= 3: break
+            print(f"{fname}: {txt[:300]}...\n")
+        print(f"[DEBUG - BERT/FAISS] all similarity scores:", scores[0][:len(raw_resume_texts)])
+        print("==============================================")
+
+    # Step 5: Rank resumes
+    ranked = [(list(raw_resume_texts.keys())[i], float(scores[0][j]))
+              for j, i in enumerate(indices[0])]
+    return ranked
+
+
+def main(args):
+    try:
+        # Load raw job and resumes
+        raw_job_text = extract_text_from_file(args.job_desc_path)
+        raw_resume_texts = bulk_load_raw_resume_files(args.resume_dir)
+
+        if not raw_resume_texts:
+            raise ValueError("⚠️ No valid resumes found in the given directory.")
+
+        # Limit the number of resumes displayed based on the top_k argument and available resumes
+        available_resumes = len(raw_resume_texts)
+        top_k = min(args.top_k, available_resumes)
+
+        if args.top_k > available_resumes:
+            print(f"\n⚠️ Only {available_resumes} resumes are available. "
+                  f"Showing top {available_resumes} matches instead of {args.top_k}.\n")
+
+        # Choose model
+        if args.model == "tfidf":
+            ranked = rank_with_tfidf(args, raw_job_text, raw_resume_texts)
+        elif args.model == "bert":
+            ranked = rank_with_bert(args, raw_job_text, raw_resume_texts)
+        else:
+            raise ValueError("❌ Invalid model. Choose 'tfidf' or 'bert'.")
+
+        # Display ranked resumes
+        print(f"\n🎯 Top {top_k} Candidate Matches for the Job ({args.model.upper()}):")
+        for i, (fname, score) in enumerate(ranked[:top_k], 1):
+            print(f"{i}. {fname} → score: {score:.4f}")
+
     except Exception as e:
         print(f"❌ Error: {str(e)}")


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Recruiter Pipeline: Rank resumes for a given job description")
+    parser = argparse.ArgumentParser(description="Recruiter Pipeline: Rank user uploaded resumes for a given job description")
+
+    # Shared arguments
     parser.add_argument('--job_desc_path', type=str, required=True, help="Path to job description file")
     parser.add_argument('--resume_dir', type=str, required=True, help="Directory containing applicant resumes")
-    parser.add_argument('--
+    parser.add_argument('--model', type=str, choices=['tfidf', 'bert'], default='tfidf',
+                        help="Model to use: tfidf or bert")
     parser.add_argument('--top_k', type=int, default=10, help="Number of top resumes to return")
-    parser.add_argument('--debug', action='store_true', help="Print cleaned
+    parser.add_argument('--debug', action='store_true', help="Print cleaned/raw texts and raw similarity scores")
+
+    # TF-IDF specific
+    parser.add_argument('--vectorizer_path', type=str,
+                        default='models/tfidf/recruiter_tfidf/combined_tfidf_vectorizer.pkl',
+                        help="Path to pre-trained TF-IDF vectorizer")
+
+    # BERT specific
+    parser.add_argument('--bert_model_path', type=str,
+                        default='models/bert/dapt_minilm_sentence_transformer',
+                        help="Path to fine-tuned BERT/SBERT model")

     args = parser.parse_args()
     main(args)
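Example invocations of the reworked script (paths are placeholders; the flags come from the argparse definitions above):

python pipelines/recruiter_pipeline.py --job_desc_path data/job.pdf --resume_dir data/resumes/ --model tfidf --top_k 5 --debug
python pipelines/recruiter_pipeline.py --job_desc_path data/job.pdf --resume_dir data/resumes/ --model bert

Because the job and resume embeddings are encoded with normalize_embeddings=True, the inner-product scores returned by faiss.IndexFlatIP are cosine similarities.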
src/feature_engg/tfidf_vectorizing_data.py CHANGED

@@ -27,14 +27,14 @@ def get_combined_tfidf_vectorizer(max_features: int = 40000,
     Creates a TF-IDF vectorizer with specified parameters for larger vocab with both Jobs and Resume.
     """
     return TfidfVectorizer(
-        stop_words="english",
-        lowercase=True,
+        stop_words="english",        # Remove common English stopwords
+        lowercase=True,              # Convert all to lowercase
         max_features=max_features,   # Balanced for resumes + jobs
-        ngram_range=ngram_range,
-        min_df=5,
-        max_df=0.85,
+        ngram_range=ngram_range,     # By default Unigrams + Bigrams
+        min_df=5,                    # Ignore very rare words
+        max_df=0.85,                 # Ignore very common words
         sublinear_tf=True,           # Smooth term frequency scaling
-        norm="l2"
+        norm="l2"                    # Normalize for cosine similarity
     )

 def save_vectorizer(vectorizer: TfidfVectorizer,
|
src/utils/bulk_loading.py
CHANGED
@@ -1,21 +1,20 @@
|
|
1 |
import os
|
2 |
-
from typing import List,
|
3 |
-
|
4 |
-
from src.processing.text_cleaning import clean_text # assuming you already have this
|
5 |
from src.utils.file_reader import extract_text_from_pdf, extract_text_from_docx, extract_text_from_txt
|
6 |
|
7 |
-
def
|
8 |
"""
|
9 |
-
Load multiple resumes from a directory
|
10 |
|
11 |
Args:
|
12 |
input_path : str or List[str]
|
13 |
Either:
|
14 |
- A path to a directory containing resume files, OR
|
15 |
-
- A list of individual file paths
|
|
|
16 |
|
17 |
Returns:
|
18 |
-
Dict[str, str]: Dictionary mapping file's basenames ->
|
19 |
"""
|
20 |
resumes = {}
|
21 |
|
@@ -26,9 +25,11 @@ def bulk_load_cleaned_resume_files(input_path: Union[str, List[str]]) -> Dict[st
|
|
26 |
for f in os.listdir(input_path)
|
27 |
if f.lower().endswith((".pdf", ".docx", ".txt"))
|
28 |
]
|
|
|
29 |
# Case 2: list of files
|
30 |
elif isinstance(input_path, list):
|
31 |
file_paths = input_path
|
|
|
32 |
# Case 3: single file
|
33 |
elif isinstance(input_path, str) and os.path.isfile(input_path):
|
34 |
file_paths = [input_path]
|
@@ -49,13 +50,12 @@ def bulk_load_cleaned_resume_files(input_path: Union[str, List[str]]) -> Dict[st
|
|
49 |
else:
|
50 |
print(f"β οΈ Skipping unsupported file type: {path}")
|
51 |
continue
|
52 |
-
|
53 |
-
|
54 |
-
resumes[os.path.basename(path)] =
|
55 |
|
56 |
except Exception as e:
|
57 |
print(f"β Error processing {path}: {e}")
|
58 |
|
59 |
print(f"β
Loaded {len(resumes)} resumes.")
|
60 |
-
|
61 |
return resumes
|
|
|
1 |
import os
|
2 |
+
from typing import List, Union
|
|
|
|
|
3 |
from src.utils.file_reader import extract_text_from_pdf, extract_text_from_docx, extract_text_from_txt
|
4 |
|
5 |
+
def bulk_load_raw_resume_files(input_path: Union[str, List[str]]):
|
6 |
"""
|
7 |
+
Load multiple resumes from a directory, a list of files, or a single file.
|
8 |
|
9 |
Args:
|
10 |
input_path : str or List[str]
|
11 |
Either:
|
12 |
- A path to a directory containing resume files, OR
|
13 |
+
- A list of individual file paths, OR
|
14 |
+
- A single file path.
|
15 |
|
16 |
Returns:
|
17 |
+
Dict[str, str]: Dictionary mapping file's basenames -> raw text.
|
18 |
"""
|
19 |
resumes = {}
|
20 |
|
|
|
25 |
for f in os.listdir(input_path)
|
26 |
if f.lower().endswith((".pdf", ".docx", ".txt"))
|
27 |
]
|
28 |
+
|
29 |
# Case 2: list of files
|
30 |
elif isinstance(input_path, list):
|
31 |
file_paths = input_path
|
32 |
+
|
33 |
# Case 3: single file
|
34 |
elif isinstance(input_path, str) and os.path.isfile(input_path):
|
35 |
file_paths = [input_path]
|
|
|
50 |
else:
|
51 |
print(f"β οΈ Skipping unsupported file type: {path}")
|
52 |
continue
|
53 |
+
|
54 |
+
# β
Add to dictionary
|
55 |
+
resumes[os.path.basename(path)] = text
|
56 |
|
57 |
except Exception as e:
|
58 |
print(f"β Error processing {path}: {e}")
|
59 |
|
60 |
print(f"β
Loaded {len(resumes)} resumes.")
|
|
|
61 |
return resumes
|
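A short usage sketch for the renamed loader (the directory and file names are placeholders):

# bulk_load_raw_resume_files accepts a directory, a list of file paths, or a single file,
# and returns {basename: raw_text}; text cleaning is no longer done in this helper.
from src.utils.bulk_loading import bulk_load_raw_resume_files

resumes = bulk_load_raw_resume_files("data/resumes/")                # whole directory
# resumes = bulk_load_raw_resume_files(["cv1.pdf", "cv2.docx"])      # or explicit files
for fname, raw_text in resumes.items():
    print(f"{fname}: {len(raw_text)} characters")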