import os
from pathlib import Path
from typing import List, Optional, Tuple

import pandas as pd
from huggingface_hub import hf_hub_download

from src.feature_engg.bert_embedding_data import load_bert_model, load_faiss_index
from src.feature_engg.tfidf_vectorizing_data import load_tfidf_matrix, load_tfidf_vectorizer
from src.matching.matching_engine import (
    compute_similarity_matrix,
    top_n_bert_matches,
    top_n_tfidf_matches,
)
from src.processing.text_cleaning import clean_text, clean_text_for_bert

# Repository root: three directory levels above this file.
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent


def load_job_titles(local_path=None, repo_id=None, filename=None):
    """Load the job-titles CSV from disk or from the Hugging Face Hub.

    A local file is preferred when ``local_path`` is given and exists;
    otherwise the file is downloaded with ``hf_hub_download``.

    Args:
        local_path (str, optional): Path to a local CSV file.
        repo_id (str, optional): Hugging Face repo ID to download from.
        filename (str, optional): Path of the CSV inside the repo.

    Returns:
        pd.DataFrame: The parsed CSV; guaranteed to have a ``title`` column.

    Raises:
        ValueError: If neither an existing ``local_path`` nor both
            ``repo_id`` and ``filename`` are supplied, or if the CSV has no
            ``title`` column.
    """
    if local_path and os.path.exists(local_path):
        print(f"📂 Using local job titles from {local_path}")
        file_path = local_path
    elif repo_id and filename:
        print(f"🌐 Downloading job titles from Hugging Face Hub ({repo_id}/{filename})")
        file_path = hf_hub_download(repo_id=repo_id, filename=filename)
    else:
        raise ValueError("Must provide either a valid local_path or repo_id and filename.")

    df = pd.read_csv(file_path)
    if "title" not in df.columns:
        raise ValueError("Job CSV must contain a 'title' column.")
    return df


def _resolve_top_k(top_k, total_jobs):
    """Clamp the requested match count to what is available and describe it.

    Args:
        top_k (int or None): Requested number of matches; ``None`` means all.
        total_jobs (int): Number of distinct job titles available.

    Returns:
        Tuple[int, str]: ``(final_top_k, message)`` where ``message`` is the
        user-facing summary of how many matches are being shown.
    """
    if top_k is None:
        return total_jobs, f"✅ Showing all {total_jobs} job matches, ranked by relevance."
    if top_k > total_jobs:
        return total_jobs, (
            f"ℹ️ You requested {top_k} matches, but only {total_jobs} are available. "
            f"Showing all {total_jobs} matches."
        )
    return top_k, f"✅ Showing the top {top_k} job matches."


def run_tfidf_pipeline(raw_resume: str,
                       *,
                       vectorizer=None,
                       job_matrix=None,
                       job_df=None,
                       local_vectorizer_path=None,
                       local_matrix_path=None,
                       repo_id="Om-Shandilya/resume-matcher-tfidf",
                       vectorizer_filename="applicant/job_vectorizer.pkl",
                       matrix_filename="applicant/job_matrix.npz",
                       top_k: Optional[int] = None,
                       debug=False,
                       titles_filename="applicant/tfidf_job_titles.csv",
                       ) -> Tuple[List[Tuple[str, float]], str]:
    """Return top-N matches using TF-IDF pipeline.

    Args:
        raw_resume (str): Raw text of the resume.
        vectorizer (TfidfVectorizer, optional): Preloaded TF-IDF vectorizer.
        job_matrix (scipy.sparse matrix, optional): Preloaded TF-IDF job matrix.
        job_df (pd.DataFrame, optional): DataFrame of job titles.
        local_vectorizer_path (str, optional): Local path to TF-IDF vectorizer.
        local_matrix_path (str, optional): Local path to TF-IDF matrix.
        repo_id (str): Hugging Face repo ID for vectorizer/matrix/job titles.
        vectorizer_filename (str): Filename of the vectorizer in the repo.
        matrix_filename (str): Filename of the matrix in the repo.
        top_k (int, optional): Number of top matches to return. If None,
            return all. Values larger than the number of distinct titles are
            clamped.
        debug (bool, optional): Print the cleaned resume preview and the raw
            similarity scores of the returned matches.
        titles_filename (str): Filename of the job-titles CSV in the repo;
            only used when ``job_df`` is not supplied.

    Returns:
        Tuple[List[Tuple[str, float]], str]: ``(matches, message)`` where
        ``matches`` is a list of (job_title, score) pairs and ``message`` is
        a user-facing summary of how many matches are shown.
    """
    cleaned_resume = clean_text(raw_resume)

    if vectorizer is None:
        vectorizer = load_tfidf_vectorizer(local_vectorizer_path, repo_id, vectorizer_filename)
    if job_matrix is None:
        job_matrix = load_tfidf_matrix(local_matrix_path, repo_id, matrix_filename)

    resume_vector = vectorizer.transform([cleaned_resume])
    sim_matrix = compute_similarity_matrix(resume_vector, job_matrix)

    if job_df is None:
        # Titles come from the same repo as the matrix so that row indices
        # and titles stay aligned when a non-default repo_id is passed.
        job_df = load_job_titles(repo_id=repo_id, filename=titles_filename)

    total_jobs = len(job_df['title'].unique())
    final_top_k, message = _resolve_top_k(top_k, total_jobs)

    matches = top_n_tfidf_matches(sim_matrix, top_n=final_top_k, job_df=job_df)
    # Only one resume was vectorized, so only the first result row is used.
    resume_matches = matches[0]

    if debug:
        print("\n================ DEBUG MODE ================")
        print("\n📄--- [DEBUG - TFIDF] Cleaned Resume Preview:\n", cleaned_resume[:1000], "---")
        print(f"\n--- [DEBUG - TFIDF] Raw Similarity Scores (top {final_top_k}) ---")
        for job_idx, score in resume_matches:
            print(f"[{job_idx}] {job_df.iloc[job_idx]['title']} → {score:0.6f}")
        print("==============================================")

    return [(job_df.iloc[j]['title'], score) for j, score in resume_matches], message


def run_bert_pipeline(raw_resume: str,
                      *,
                      model=None,
                      job_index=None,
                      job_df=None,
                      local_bert_path=None,
                      local_index_path=None,
                      repo_id="Om-Shandilya/resume-matcher-bert",
                      index_filename="applicant/jobs.faiss",
                      top_k: Optional[int] = None,
                      debug=False,
                      titles_filename="applicant/bert_job_titles.csv",
                      ) -> Tuple[List[Tuple[str, float]], str]:
    """Return top-N matches using BERT + FAISS pipeline.

    Args:
        raw_resume (str): Raw text of the resume.
        model (SentenceTransformer, optional): Preloaded BERT model.
        job_index (faiss.Index, optional): Preloaded FAISS index.
        job_df (pd.DataFrame, optional): DataFrame of job titles.
        local_bert_path (str, optional): Local path to BERT model.
        local_index_path (str, optional): Local path to FAISS index.
        repo_id (str): Hugging Face repo ID for model/index/job titles.
        index_filename (str): Filename of the FAISS index in the repo.
        top_k (int, optional): Number of top matches to return. If None,
            return all. Values larger than the number of distinct titles are
            clamped.
        debug (bool, optional): Print the raw similarity scores of the
            returned matches.
        titles_filename (str): Filename of the job-titles CSV in the repo;
            only used when ``job_df`` is not supplied.

    Returns:
        Tuple[List[Tuple[str, float]], str]: ``(matches, message)`` where
        ``matches`` is a list of (job_title, score) pairs and ``message`` is
        a user-facing summary of how many matches are shown.
    """
    if model is None:
        model = load_bert_model(local_bert_path=local_bert_path, repo_id=repo_id)
    if job_index is None:
        job_index = load_faiss_index(local_index_path, repo_id, index_filename)

    cleaned_resume = clean_text_for_bert(raw_resume)
    resume_embedding = model.encode([cleaned_resume], normalize_embeddings=True)

    # Query every vector in the index; trimming to top_k happens in
    # top_n_bert_matches below.
    distances, indices = job_index.search(resume_embedding, job_index.ntotal)

    if job_df is None:
        # Titles come from the same repo as the index so that index ids and
        # titles stay aligned when a non-default repo_id is passed.
        job_df = load_job_titles(repo_id=repo_id, filename=titles_filename)

    total_jobs = len(job_df['title'].unique())
    final_top_k, message = _resolve_top_k(top_k, total_jobs)

    matches = top_n_bert_matches(indices, distances, job_df, top_n=final_top_k)

    if debug:
        print("\n================ DEBUG MODE ================")
        print(f"\n--- [DEBUG - BERT/FAISS] Raw Similarity Scores (top {final_top_k}) ---")
        for idx, score in matches:
            print(f"[{idx}] {job_df.iloc[idx]['title']} → {score:0.6f}")
        print("==============================================")

    return [(job_df.iloc[i]['title'], score) for i, score in matches], message