Commit 0ad99b7
Parent(s): 92feaec

Add GUI and Refactor Pipelines for reusability
Changed files:
- .gitignore (+1, -0)
- gui/app.py (+202, -0)
- pipelines/app_pipeline.py (+0, -179)
- pipelines/applicant_pipeline.py (+59, -0)
- pipelines/core/applicant.py (+125, -0)
- pipelines/core/recruiter.py (+100, -0)
- pipelines/recruiter_pipeline.py (+34, -128)
- src/feature_engg/bert_embedding_data.py (+40, -20)
- src/feature_engg/tfidf_vectorizing_data.py (+13, -10)
- src/utils/file_reader.py (+3, -0)
.gitignore
CHANGED

```diff
@@ -215,4 +215,5 @@ data/processed/*.txt
 data/raw/*/*csv
 data/saved_plots/
 models/
+__pycache__/
 tests/
```
gui/app.py
ADDED
@@ -0,0 +1,202 @@

```python
import streamlit as st
import os
import tempfile
import pandas as pd
import shutil
import sys
import altair as alt
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.utils.bulk_loading import bulk_load_raw_resume_files
from src.utils.file_reader import extract_text_from_file
from pipelines.core.applicant import run_tfidf_pipeline as applicant_tfidf, run_bert_pipeline as applicant_bert
from pipelines.core.recruiter import rank_with_tfidf as recruiter_tfidf, rank_with_bert as recruiter_bert

# --- App Configuration ---
st.set_page_config(
    page_title="Resume-Job Matcher",
    page_icon="📄",
    layout="wide"
)

# --- Main App ---
st.title("🎯 AI-Powered Resume-Job Matcher")
st.write("---")

# --- Sidebar for Mode Selection ---
with st.sidebar:
    st.header("Controls")
    app_mode = st.radio(
        "Choose your view",
        ("Applicant", "Recruiter"),
        help="Select 'Applicant' to match your resume to jobs. Select 'Recruiter' to rank resumes for a job."
    )
    model_choice = st.selectbox(
        "Choose the AI Model",
        ("TF-IDF", "BERT"),
        help="TF-IDF is faster. BERT is more accurate."
    )

    st.write("---")

    # Add a checkbox to control the 'show all' feature
    show_all = st.checkbox("Show all matches", value=False)

    if show_all:
        top_k = None
        # Disable the slider when 'show_all' is checked for better UX
        st.slider(
            "Number of matches to show",
            min_value=1, max_value=50, value=5, step=1,
            disabled=True
        )
        st.info("Showing all ranked results.")
    else:
        # Enable the slider when 'show_all' is unchecked
        top_k = st.slider(
            "Number of matches to show",
            min_value=1, max_value=50, value=5, step=1,
            disabled=False
        )


# --- Applicant View ---
if app_mode == "Applicant":
    st.header("Applicant: Match Your Resume to a Job")

    resume_file = st.file_uploader(
        "Upload your resume",
        type=['pdf', 'docx', 'txt'],
        help="Please upload your resume in PDF, DOCX, or TXT format."
    )

    if resume_file:
        st.success(f"✅ Successfully uploaded `{resume_file.name}`")
        if st.button("Find Top Job Matches", type="primary", use_container_width=True):

            with st.spinner(f"Analyzing resume with {model_choice}..."):

                tmp_file_path = None
                try:
                    with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(resume_file.name)[1]) as tmp_file:
                        tmp_file.write(resume_file.getvalue())
                        tmp_file_path = tmp_file.name

                    raw_resume_text = extract_text_from_file(tmp_file_path)

                    if model_choice == "BERT":
                        matches, message = applicant_bert(raw_resume_text, top_k=top_k)
                    else:
                        matches, message = applicant_tfidf(raw_resume_text, top_k=top_k)

                    if not matches:
                        st.warning("⚠️ No suitable job matches found.")
                    else:
                        st.subheader(f"Top {len(matches)} Job Matches:")
                        st.info(message)

                        df = pd.DataFrame(matches, columns=["Job Title", "Match Score"])

                        # Sort the DataFrame by 'Match Score' in descending order to show best matches at the top
                        df = df.sort_values(by="Match Score", ascending=False).reset_index(drop=True)

                        chart = alt.Chart(df).mark_bar().encode(
                            y=alt.Y('Job Title', sort='-x', title=None),
                            x=alt.X('Match Score', axis=None, scale=alt.Scale(domainMin=0)),
                            # Tooltip to reveal score on hover
                            tooltip=['Job Title', alt.Tooltip('Match Score', format='.3f')]
                        ).properties(
                            # Set a title for the chart to indicate what the bars represent
                            title="Relative Job Match Scores"
                        ).interactive()

                        st.altair_chart(chart, use_container_width=True)

                except Exception as e:
                    st.error(f"An error occurred: {e}")

                finally:
                    if tmp_file_path and os.path.exists(tmp_file_path):
                        os.unlink(tmp_file_path)


# --- Recruiter View ---
if app_mode == "Recruiter":
    st.header("Recruiter: Rank Resumes for a Job Description")

    job_desc_file = st.file_uploader(
        "Upload the job description",
        type=['pdf', 'docx', 'txt'],
        help="Upload the job description in PDF, DOCX, or TXT format."
    )

    resume_files = st.file_uploader(
        "Upload candidate resumes",
        type=['pdf', 'docx', 'txt'],
        accept_multiple_files=True,
        help="Upload one or more resumes."
    )

    if job_desc_file and resume_files:
        st.success(f"✅ Successfully uploaded job description `{job_desc_file.name}` and {len(resume_files)} resumes.")
        if st.button("Rank Resumes", type="primary", use_container_width=True):

            with st.spinner(f"Ranking {len(resume_files)} resumes with {model_choice}..."):

                # Paths for cleanup in the finally block
                temp_dir = None
                job_desc_path = None

                try:
                    # 1. Handle the single job description file
                    with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(job_desc_file.name)[1]) as tmp_file:
                        tmp_file.write(job_desc_file.getvalue())
                        job_desc_path = tmp_file.name
                    raw_job_text = extract_text_from_file(job_desc_path)

                    # 2. Handle multiple resume files by creating a temp directory for bulk loading
                    temp_dir = tempfile.mkdtemp()
                    for resume_file in resume_files:
                        resume_path = os.path.join(temp_dir, resume_file.name)
                        with open(resume_path, "wb") as f:
                            f.write(resume_file.getbuffer())

                    # Bulk loading all resumes from the temp directory
                    raw_resume_texts = bulk_load_raw_resume_files(temp_dir)

                    # 3. Call the appropriate model's pipeline based on the model choice (default to TF-IDF)
                    if model_choice == "BERT":
                        ranked_resumes, message = recruiter_bert(raw_job_text, raw_resume_texts, top_k=top_k)
                    else:
                        ranked_resumes, message = recruiter_tfidf(raw_job_text, raw_resume_texts, top_k=top_k)

                    # 4. Display results
                    if not ranked_resumes:
                        st.warning("⚠️ Could not rank resumes. Please check the files.")
                    else:
                        st.subheader(f"Top {len(ranked_resumes)} Ranked Resumes:")
                        st.info(message)
                        df = pd.DataFrame(ranked_resumes, columns=["Resume", "Match Score"])

                        df["Match Score"] = df["Match Score"].apply(lambda x: min(1.0, x))
                        st.dataframe(
                            df,
                            column_config={
                                "Resume": st.column_config.TextColumn("Resume"),
                                "Match Score": st.column_config.ProgressColumn(
                                    "Match Score",
                                    format="%.2f",
                                    min_value=0,
                                    max_value=1,
                                ),
                            },
                            use_container_width=True,
                            hide_index=True,
                        )

                except Exception as e:
                    st.error(f"⚠️ An error occurred: {e}")

                finally:
                    # 5. Clean up all temporary files and the directory
                    if job_desc_path and os.path.exists(job_desc_path):
                        os.unlink(job_desc_path)
                    if temp_dir and os.path.exists(temp_dir):
                        shutil.rmtree(temp_dir)
```
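The GUI is a thin layer over the core pipelines, so the matching logic can be exercised without Streamlit. A minimal sketch (the resume path is hypothetical; assumes the repo root is the working directory so the imports resolve, and that the Hub artifacts are reachable):

```python
# Minimal sketch: call the core applicant pipeline directly, outside the GUI.
from src.utils.file_reader import extract_text_from_file
from pipelines.core.applicant import run_tfidf_pipeline

raw_resume = extract_text_from_file("my_resume.pdf")        # hypothetical input file
matches, message = run_tfidf_pipeline(raw_resume, top_k=5)  # (title, score) pairs + status message

print(message)
for title, score in matches:
    print(f"{title}: {score:.4f}")
```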
pipelines/app_pipeline.py
DELETED
@@ -1,179 +0,0 @@

Removed file (its CLI resurfaces as pipelines/applicant_pipeline.py on top of pipelines/core/applicant.py):

```python
import argparse
import os
import pandas as pd
from src.feature_engg.tfidf_vectorizing_data import load_tfidf_vectorizer, load_tfidf_matrix
from src.feature_engg.bert_embedding_data import get_bert_model, load_faiss_index
from src.processing.text_cleaning import clean_text, clean_text_for_bert
from src.matching.matching_engine import compute_similarity_matrix, top_n_tfidf_matches, top_n_bert_matches
from src.utils.file_reader import extract_text_from_file


def load_job_titles(job_csv_path: str):
    df = pd.read_csv(job_csv_path)
    if "title" not in df.columns:
        raise ValueError("Job CSV must contain a 'title' column.")
    return df


# ------------------------- TF-IDF PIPELINE -------------------------
def run_tfidf_pipeline(args, raw_resume: str):

    # Step 1: Clean resume
    cleaned_resume = clean_text(raw_resume)

    # Step 2: Load vectorizer + job matrix (local first, fallback HF)
    vectorizer = load_tfidf_vectorizer(
        local_vectorizer_path=args.local_vectorizer_path,
        repo_id=args.tfidf_repo_id,
        filename=args.vectorizer_filename
    )
    job_matrix = load_tfidf_matrix(
        local_matrix_path=args.local_matrix_path,
        repo_id=args.tfidf_repo_id,
        filename=args.matrix_filename
    )

    # Step 3: Vectorize resume
    resume_vector = vectorizer.transform([cleaned_resume])

    # Step 4: Compute cosine similarity
    sim_matrix = compute_similarity_matrix(resume_vector, job_matrix)

    # Step 5: Load job titles
    job_df = load_job_titles("data/app_data/tfidf_job_titles.csv")

    # Step 6: Get top-N job matches
    top_k = args.top_k

    if args.top_k > len(job_df['title'].unique()):
        print(f"⚠️ Requested top_k={args.top_k} exceeds unique job titles={len(job_df['title'].unique())}. Reducing top_k.")
        top_k = len(job_df['title'].unique())

    elif args.top_k is None:
        top_k = len(job_df['title'].unique())
        print(f"\nℹ️ Showing all {top_k} job titles.\n")

    matches = top_n_tfidf_matches(sim_matrix, top_n=top_k, job_df=job_df)

    print(f"\n🎯 Top {top_k} Job Matches for the Resume (TF-IDF):")
    for job_idx, score in matches[0]:
        print(f"🔹 {job_df.iloc[job_idx]['title']} (score: {score:0.4f})")

    if args.debug:
        print("\n================ DEBUG MODE ================")
        print("\n🔍--- [DEBUG - TFIDF] Cleaned Resume Preview:\n", cleaned_resume[:1000], "---")
        print(f"\n--- [DEBUG - TFIDF] Raw Similarity Scores (top {top_k}) ---")
        for job_idx, score in matches[0]:
            print(f"[{job_idx}] {job_df.iloc[job_idx]['title']} -> {score:0.6f}")
        print("==============================================")


# ------------------------- BERT PIPELINE -------------------------
def run_bert_pipeline(args, raw_resume: str):

    # Step 1: Load fine-tuned ST model (local or HF Hub)
    model = get_bert_model(args.local_bert_path or args.bert_repo_id)

    # Step 2: Load FAISS index (local or HF Hub)
    job_index = load_faiss_index(
        local_index_path=args.local_index_path,
        repo_id=args.bert_repo_id,
        filename=args.index_filename
    )

    # Step 3: Clean resume text for transformer
    cleaned_resume = clean_text_for_bert(raw_resume)

    # Step 4: Embed
    resume_embedding = model.encode(
        [cleaned_resume],
        normalize_embeddings=True
    )

    # Step 5: Search
    n_jobs = job_index.ntotal
    D, I = job_index.search(resume_embedding, n_jobs)

    # Step 6: Load job titles
    job_df = load_job_titles("data/app_data/bert_job_titles.csv")

    # Step 7: Rank top-N
    top_k = args.top_k

    if args.top_k > len(job_df['title'].unique()):
        print(f"⚠️ Requested top_k={args.top_k} exceeds unique job titles={len(job_df['title'].unique())}. Reducing top_k.")
        top_k = len(job_df['title'].unique())

    elif args.top_k is None:
        top_k = len(job_df['title'].unique())
        print(f"\nℹ️ Showing all {top_k} job titles.\n")

    matches = top_n_bert_matches(I, D, job_df, top_n=top_k)

    print(f"\n🎯 Top {top_k} Job Matches for the Resume (BERT):")
    for idx, score in matches:
        print(f"🔹 {job_df.iloc[idx]['title']} (score: {score:0.4f})")

    if args.debug:
        print("\n================ DEBUG MODE ================")
        print(f"\n--- [DEBUG - BERT/FAISS] Raw Similarity Scores (top {top_k}) ---")
        for idx, score in matches:
            print(f"[{idx}] {job_df.iloc[idx]['title']} -> {score:0.6f}")
        print("==============================================")


# ------------------------- MAIN -------------------------
def main(args):
    try:
        if not os.path.exists(args.resume_path):
            raise FileNotFoundError(f"⚠️ Resume file not found at: {args.resume_path}")

        raw_resume = extract_text_from_file(args.resume_path)
        print(f"\n📄 Resume: {args.resume_path}")

        # Pipeline selector
        print(f"⚙️ Using model: {args.model.upper()}")
        if args.model == "bert":
            run_bert_pipeline(args, raw_resume)
        else:
            run_tfidf_pipeline(args, raw_resume)

    except Exception as e:
        print(f"❌ Error: {str(e)}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Match a resume to top relevant job titles")

    # Shared args
    parser.add_argument("--resume_path", type=str, required=True, help="Path to resume file")
    parser.add_argument("--model", type=str, choices=["tfidf", "bert"], default="tfidf")
    parser.add_argument("--top_k", type=int, default=None,
                        help="Number of top matches to return; if not specified, returns all")
    parser.add_argument("--debug", action="store_true",
                        help="print raw similarity scores for both and cleaned resume for tfidf pipeline")

    # TF-IDF args
    parser.add_argument("--local_vectorizer_path", type=str, default=None,
                        help="Local TF-IDF vectorizer .pkl file")
    parser.add_argument("--local_matrix_path", type=str, default=None,
                        help="Local TF-IDF job matrix .npz file")
    parser.add_argument("--tfidf_repo_id", type=str, default="Om-Shandilya/resume-matcher-tfidf",
                        help="Hub repo id for HuggingFace model")
    parser.add_argument("--vectorizer_filename", type=str, default="applicant/job_vectorizer.pkl",
                        help="Filename of vectorizer in the HF repo")
    parser.add_argument("--matrix_filename", type=str, default="applicant/job_matrix.npz",
                        help="Filename of matrix in the HF repo")

    # BERT args
    parser.add_argument("--local_bert_path", type=str, default=None,
                        help="Local fine-tuned ST model path")
    parser.add_argument("--local_index_path", type=str, default=None,
                        help="Local FAISS index file path")
    parser.add_argument("--bert_repo_id", type=str, default="Om-Shandilya/resume-matcher-bert",
                        help="fine-tuned ST model's HF repo id")
    parser.add_argument("--index_filename", type=str, default="applicant/jobs.faiss",
                        help="Filename of FAISS index in the HF repo")

    args = parser.parse_args()
    main(args)
```
pipelines/applicant_pipeline.py
ADDED
@@ -0,0 +1,59 @@

```python
import argparse, os
from src.utils.file_reader import extract_text_from_file
from pipelines.core.applicant import run_tfidf_pipeline, run_bert_pipeline

def main(args):
    try:
        if not os.path.exists(args.resume_path):
            raise FileNotFoundError(f"⚠️ Resume not found at {args.resume_path}")
        raw_resume = extract_text_from_file(args.resume_path)

        if args.model == "bert":
            matches, message = run_bert_pipeline(raw_resume,
                                                 local_bert_path=args.local_bert_path,
                                                 local_index_path=args.local_index_path,
                                                 repo_id=args.bert_repo_id,
                                                 index_filename=args.index_filename,
                                                 top_k=args.top_k,
                                                 debug=args.debug)
        else:
            matches, message = run_tfidf_pipeline(raw_resume,
                                                  local_vectorizer_path=args.local_vectorizer_path,
                                                  local_matrix_path=args.local_matrix_path,
                                                  repo_id=args.tfidf_repo_id,
                                                  vectorizer_filename=args.vectorizer_filename,
                                                  matrix_filename=args.matrix_filename,
                                                  top_k=args.top_k,
                                                  debug=args.debug)

        print(f"\n{message}")
        print(f"\n🎯 Top {len(matches)} Job Matches ({args.model.upper()}):")
        for title, score in matches:
            print(f"🔹 {title} (score: {score:.4f})")

    except Exception as e:
        print(f"❌ Error: {str(e)}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Match a resume to top relevant job titles")
    parser.add_argument("--resume_path", type=str, required=True)
    parser.add_argument("--model", choices=["tfidf", "bert"], default="tfidf")
    parser.add_argument("--top_k", type=int, default=None)
    parser.add_argument("--debug", action="store_true",
                        help="print raw similarity scores for both and cleaned resume for tfidf pipeline")

    # tfidf args
    parser.add_argument("--local_vectorizer_path", type=str, default=None)
    parser.add_argument("--local_matrix_path", type=str, default=None)
    parser.add_argument("--tfidf_repo_id", type=str, default="Om-Shandilya/resume-matcher-tfidf")
    parser.add_argument("--vectorizer_filename", type=str, default="applicant/job_vectorizer.pkl")
    parser.add_argument("--matrix_filename", type=str, default="applicant/job_matrix.npz")

    # bert args
    parser.add_argument("--local_bert_path", type=str, default=None)
    parser.add_argument("--local_index_path", type=str, default=None)
    parser.add_argument("--bert_repo_id", type=str, default="Om-Shandilya/resume-matcher-bert")
    parser.add_argument("--index_filename", type=str, default="applicant/jobs.faiss")

    args = parser.parse_args()
    main(args)
```
pipelines/core/applicant.py
ADDED
@@ -0,0 +1,125 @@

```python
import pandas as pd
from pathlib import Path
from src.feature_engg.tfidf_vectorizing_data import load_tfidf_vectorizer, load_tfidf_matrix
from src.feature_engg.bert_embedding_data import load_bert_model, load_faiss_index
from src.processing.text_cleaning import clean_text, clean_text_for_bert
from src.matching.matching_engine import compute_similarity_matrix, top_n_tfidf_matches, top_n_bert_matches

# Defining paths for data files
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent

def load_job_titles(job_csv_path: str):
    df = pd.read_csv(job_csv_path)
    if "title" not in df.columns:
        raise ValueError("Job CSV must contain a 'title' column.")
    return df

def run_tfidf_pipeline(raw_resume: str,
                       local_vectorizer_path=None,
                       local_matrix_path=None,
                       repo_id="Om-Shandilya/resume-matcher-tfidf",
                       vectorizer_filename="applicant/job_vectorizer.pkl",
                       matrix_filename="applicant/job_matrix.npz",
                       top_k=None,
                       debug=False):
    """Return top-N matches using TF-IDF pipeline.

    Args:
        raw_resume (str): Raw text of the resume.
        local_vectorizer_path (str, optional): Local path to TF-IDF vectorizer.
        local_matrix_path (str, optional): Local path to TF-IDF matrix.
        repo_id (str): Hugging Face repo ID for vectorizer/matrix.
        vectorizer_filename (str): Filename of the vectorizer in the repo.
        matrix_filename (str): Filename of the matrix in the repo.
        top_k (int, optional): Number of top matches to return. If None, return all.
        debug (bool, optional): Print raw similarity scores and the cleaned resume.

    Returns:
        Tuple[List[Tuple[str, float]], str]: (job_title, score) pairs for the
        top_k matches, plus a status message.
    """
    cleaned_resume = clean_text(raw_resume)

    vectorizer = load_tfidf_vectorizer(local_vectorizer_path, repo_id, vectorizer_filename)
    job_matrix = load_tfidf_matrix(local_matrix_path, repo_id, matrix_filename)

    resume_vector = vectorizer.transform([cleaned_resume])
    sim_matrix = compute_similarity_matrix(resume_vector, job_matrix)

    job_df = load_job_titles(PROJECT_ROOT / "data/app_data/tfidf_job_titles.csv")
    total_jobs = len(job_df['title'].unique())

    message = ""
    if top_k is None:
        final_top_k = total_jobs
        message = f"✅ Showing all {total_jobs} job matches, ranked by relevance."
    elif top_k > total_jobs:
        final_top_k = total_jobs
        message = f"ℹ️ You requested {top_k} matches, but only {total_jobs} are available. Showing all {total_jobs} matches."
    else:
        final_top_k = top_k
        message = f"✅ Showing the top {final_top_k} job matches."

    matches = top_n_tfidf_matches(sim_matrix, top_n=final_top_k, job_df=job_df)

    if debug:
        print("\n================ DEBUG MODE ================")
        print("\n🔍--- [DEBUG - TFIDF] Cleaned Resume Preview:\n", cleaned_resume[:1000], "---")
        print(f"\n--- [DEBUG - TFIDF] Raw Similarity Scores (top {final_top_k}) ---")
        for job_idx, score in matches[0]:
            print(f"[{job_idx}] {job_df.iloc[job_idx]['title']} -> {score:0.6f}")
        print("==============================================")

    return [(job_df.iloc[j]['title'], score) for j, score in matches[0]], message


def run_bert_pipeline(raw_resume: str,
                      local_bert_path=None,
                      local_index_path=None,
                      repo_id="Om-Shandilya/resume-matcher-bert",
                      index_filename="applicant/jobs.faiss",
                      top_k=None,
                      debug=False):
    """Return top-N matches using BERT + FAISS pipeline.

    Args:
        raw_resume (str): Raw text of the resume.
        local_bert_path (str, optional): Local path to BERT model.
        local_index_path (str, optional): Local path to FAISS index.
        repo_id (str): Hugging Face repo ID for model/index.
        index_filename (str): Filename of the FAISS index in the repo.
        top_k (int, optional): Number of top matches to return. If None, return all.
        debug (bool, optional): Print raw similarity scores.

    Returns:
        Tuple[List[Tuple[str, float]], str]: (job_title, score) pairs for the
        top_k matches, plus a status message.
    """
    model = load_bert_model(local_bert_path=local_bert_path, repo_id=repo_id)
    job_index = load_faiss_index(local_index_path, repo_id, index_filename)

    cleaned_resume = clean_text_for_bert(raw_resume)
    resume_embedding = model.encode([cleaned_resume], normalize_embeddings=True)

    D, I = job_index.search(resume_embedding, job_index.ntotal)
    job_df = load_job_titles(PROJECT_ROOT / "data/app_data/bert_job_titles.csv")
    total_jobs = len(job_df['title'].unique())

    message = ""
    if top_k is None:
        final_top_k = total_jobs
        message = f"✅ Showing all {total_jobs} job matches, ranked by relevance."
    elif top_k > total_jobs:
        final_top_k = total_jobs
        message = f"ℹ️ You requested {top_k} matches, but only {total_jobs} are available. Showing all {total_jobs} matches."
    else:
        final_top_k = top_k
        message = f"✅ Showing the top {final_top_k} job matches."

    matches = top_n_bert_matches(I, D, job_df, top_n=final_top_k)

    if debug:
        print("\n================ DEBUG MODE ================")
        print(f"\n--- [DEBUG - BERT/FAISS] Raw Similarity Scores (top {final_top_k}) ---")
        for idx, score in matches:
            print(f"[{idx}] {job_df.iloc[idx]['title']} -> {score:0.6f}")
        print("==============================================")

    return [(job_df.iloc[i]['title'], score) for i, score in matches], message
```
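A note on the scoring: because `run_bert_pipeline` encodes with `normalize_embeddings=True`, the FAISS inner-product search returns cosine similarities directly. A self-contained sketch of that identity (the 384-dim size is an assumption, typical of MiniLM-class sentence transformers):

```python
# With unit-norm vectors, inner product == cosine similarity,
# so an IndexFlatIP search score is directly interpretable.
import numpy as np

rng = np.random.default_rng(0)
a, b = rng.random(384), rng.random(384)            # assumed embedding dimension
a, b = a / np.linalg.norm(a), b / np.linalg.norm(b)

inner = float(a @ b)
cosine = float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))
assert abs(inner - cosine) < 1e-12
```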
pipelines/core/recruiter.py
ADDED
@@ -0,0 +1,100 @@

```python
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from src.feature_engg.tfidf_vectorizing_data import load_tfidf_vectorizer
from src.feature_engg.bert_embedding_data import load_bert_model
from src.processing.text_cleaning import clean_text, clean_text_for_bert


def rank_with_tfidf(raw_job_text, raw_resume_texts, *,
                    local_vectorizer_path=None,
                    repo_id="Om-Shandilya/resume-matcher-tfidf",
                    filename="recruiter/combined_vectorizer.pkl",
                    top_k=None,
                    debug=False):
    """Rank resumes using TF-IDF similarity."""
    vectorizer = load_tfidf_vectorizer(
        local_vectorizer_path=local_vectorizer_path,
        repo_id=repo_id,
        filename=filename
    )

    cleaned_job_text = clean_text(raw_job_text)
    job_vector = vectorizer.transform([cleaned_job_text])

    cleaned_resumes = {fname: clean_text(txt) for fname, txt in raw_resume_texts.items()}
    resume_matrix = vectorizer.transform(cleaned_resumes.values())

    sims = cosine_similarity(job_vector, resume_matrix)[0]
    ranked = sorted(zip(cleaned_resumes.keys(), sims), key=lambda x: x[1], reverse=True)

    available_resumes = len(ranked)

    message = ""
    if top_k is None:
        final_top_k = available_resumes
        message = f"✅ Showing all {available_resumes} resume matches, ranked by relevance."
    elif top_k > available_resumes:
        final_top_k = available_resumes
        message = f"ℹ️ You requested {top_k} matches, but only {available_resumes} are available. Showing all {available_resumes} matches."
    else:
        final_top_k = top_k
        message = f"✅ Showing the top {final_top_k} resume matches."

    if debug:
        print("\n================ DEBUG MODE ================")
        print("\n🔍--- [DEBUG - TFIDF] Cleaned Job Description Preview:\n", cleaned_job_text[:1000], "---")
        print("\n--- [DEBUG - TFIDF] First 3 Cleaned Resumes ---")
        for i, (fname, txt) in enumerate(cleaned_resumes.items()):
            if i >= 3: break
            print(f"{fname}: {txt[:300]}...\n")
        print("\n--- [DEBUG - TFIDF] Raw Similarity Scores ---")
        for fname, score in ranked[:final_top_k]:
            print(f"{fname} -> {score:0.6f}")
        print("==============================================")

    return [(fname, score) for fname, score in ranked[:final_top_k]], message


def rank_with_bert(raw_job_text, raw_resume_texts, *,
                   local_bert_path=None,
                   repo_id="Om-Shandilya/resume-matcher-bert",
                   top_k=None,
                   debug=False):
    """Rank resumes using BERT embeddings."""
    model = load_bert_model(local_bert_path=local_bert_path, repo_id=repo_id)

    cleaned_job_text = clean_text_for_bert(raw_job_text)
    job_embedding = model.encode([cleaned_job_text], normalize_embeddings=True)

    cleaned_resumes = {fname: clean_text_for_bert(txt) for fname, txt in raw_resume_texts.items()}
    resume_embeddings = model.encode(list(cleaned_resumes.values()), normalize_embeddings=True)

    # Dot product of normalized embeddings == cosine similarity
    sims = np.dot(resume_embeddings, job_embedding.T).flatten()
    ranked = sorted(zip(cleaned_resumes.keys(), sims), key=lambda x: x[1], reverse=True)

    available_resumes = len(ranked)

    message = ""
    if top_k is None:
        final_top_k = available_resumes
        message = f"✅ Showing all {available_resumes} resume matches, ranked by relevance."
    elif top_k > available_resumes:
        final_top_k = available_resumes
        message = f"ℹ️ You requested {top_k} matches, but only {available_resumes} are available. Showing all {available_resumes} matches."
    else:
        final_top_k = top_k
        message = f"✅ Showing the top {final_top_k} resume matches."

    if debug:
        print("\n================ DEBUG MODE ================")
        print("\n🔍--- [DEBUG - BERT] Cleaned Job Description Preview:\n", cleaned_job_text[:1000], "---")
        print("\n--- [DEBUG - BERT] First 3 Cleaned Resumes ---")
        for i, (fname, txt) in enumerate(cleaned_resumes.items()):
            if i >= 3: break
            print(f"{fname}: {txt[:300]}...\n")
        print("\n--- [DEBUG - BERT] Raw Similarity Scores ---")
        for fname, score in ranked[:final_top_k]:
            print(f"{fname} -> {score:0.6f}")
        print("==============================================")

    return [(fname, score) for fname, score in ranked[:final_top_k]], message
```
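Both rankers take the raw job text plus a `{filename: text}` dict and return `(ranked_list, message)`, which is what keeps them reusable from the GUI and the CLI alike. A usage sketch with hypothetical in-memory resumes (the vectorizer is fetched from the Hub on first call):

```python
from pipelines.core.recruiter import rank_with_tfidf

job_text = "Seeking a Python engineer with NLP experience."
resumes = {                                  # hypothetical inputs
    "alice.pdf": "python developer nlp spacy scikit-learn",
    "bob.docx": "java backend engineer spring microservices",
}

ranked, message = rank_with_tfidf(job_text, resumes, top_k=2)
print(message)
for fname, score in ranked:
    print(f"{fname}: {score:.4f}")
```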
pipelines/recruiter_pipeline.py
CHANGED
@@ -1,137 +1,51 @@ and @@ -140,28 +54,20 @@

The inline TF-IDF and BERT ranking functions (near-duplicates of the applicant pipeline) were removed in favor of pipelines/core/recruiter.py; the CLI is now a thin wrapper. Resulting file:

```python
import argparse
import os
from src.utils.bulk_loading import bulk_load_raw_resume_files
from src.utils.file_reader import extract_text_from_file
from pipelines.core import recruiter


def main(args):
    try:
        if not os.path.exists(args.job_desc_path):
            raise FileNotFoundError(f"⚠️ Job description not found: {args.job_desc_path}")

        raw_job_text = extract_text_from_file(args.job_desc_path)

        if not os.path.exists(args.resume_dir):
            raise FileNotFoundError(f"⚠️ Resume directory not found: {args.resume_dir}")

        raw_resume_texts = bulk_load_raw_resume_files(args.resume_dir)

        if not raw_resume_texts:
            raise ValueError("⚠️ No valid resumes found in the given directory.")

        print(f"\n📄 Loaded Job Description: {args.job_desc_path}")
        print(f"📄 Loaded {len(raw_resume_texts)} resumes from {args.resume_dir}")
        print(f"⚙️ Using model: {args.model.upper()}")

        if args.model == "bert":
            matches, message = recruiter.rank_with_bert(raw_job_text,
                                                        raw_resume_texts,
                                                        local_bert_path=args.local_bert_path,
                                                        repo_id=args.bert_repo_id,
                                                        top_k=args.top_k,
                                                        debug=args.debug)
        else:
            matches, message = recruiter.rank_with_tfidf(raw_job_text,
                                                         raw_resume_texts,
                                                         local_vectorizer_path=args.local_vectorizer_path,
                                                         repo_id=args.tfidf_repo_id,
                                                         filename=args.vectorizer_filename,
                                                         top_k=args.top_k,
                                                         debug=args.debug)

        print(f"\n{message}")
        print(f"\n🎯 Top {len(matches)} Candidate Matches ({args.model.upper()}):")
        for i, (fname, score) in enumerate(matches, 1):
            print(f"{i})-> {fname} (score: {score:.4f})")

    except Exception as e:
        print(f"❌ Error: {str(e)}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Recruiter Pipeline: Rank resumes for a given job description")

    parser.add_argument("--job_desc_path", type=str, required=True)
    parser.add_argument("--resume_dir", type=str, required=True)
    parser.add_argument("--model", type=str, choices=["tfidf", "bert"], default="tfidf")
    parser.add_argument("--top_k", type=int, default=None)
    parser.add_argument("--debug", action="store_true")

    # TF-IDF args
    parser.add_argument("--local_vectorizer_path", type=str, default=None)
    parser.add_argument("--tfidf_repo_id", type=str, default="Om-Shandilya/resume-matcher-tfidf")
    parser.add_argument("--vectorizer_filename", type=str, default="recruiter/combined_vectorizer.pkl")

    # BERT args
    parser.add_argument("--local_bert_path", type=str, default=None)
    parser.add_argument("--bert_repo_id", type=str, default="Om-Shandilya/resume-matcher-bert")

    args = parser.parse_args()
    main(args)
```
src/feature_engg/bert_embedding_data.py
CHANGED

@@ -11,19 +11,20 @@ — get_bert_model now documents its model_name argument and logs the device it loads on:

```python
def get_bert_model(model_name: str,
                   device: str = None):
    """
    Loads a BERT-based sentence transformer model for embeddings.

    Args:
        model_name (str): Hugging Face model name or path.
        device (str, optional): "cuda", "cpu", or None (auto-detect).

    Returns:
        SentenceTransformer: Loaded model ready for encoding.
    """
    device = device or ("cuda" if torch.cuda.is_available() else "cpu")
    print(f"📂 Loading BERT model '{model_name}' on device: {device}")
    return SentenceTransformer(model_name, device=device)
```

@@ -102,25 +103,44 @@ — the previously incomplete load_faiss_index (local path only) and the partial load_bert_model are replaced with full local-then-Hub loaders:

```python
def load_faiss_index(local_index_path: str, repo_id: str, filename: str):
    """Load FAISS index, preferring local then HF Hub."""
    if local_index_path:
        if not os.path.exists(local_index_path):
            raise FileNotFoundError(f"❌ Local FAISS index not found at {local_index_path}")
        print(f"📂 Loading local FAISS index from {local_index_path}")
        return faiss.read_index(local_index_path)

    print(f"📥 Downloading FAISS index from Hugging Face Hub ({repo_id}/{filename})")
    faiss_path = hf_hub_download(repo_id=repo_id, filename=filename)
    return faiss.read_index(faiss_path)


def load_bert_model(local_bert_path: str, repo_id: str = 'Om-Shandilya/resume-matcher-bert'):
    """
    Load a SentenceTransformer BERT model:
    - If local_bert_path is provided, it must be a valid path.
    - If local_bert_path is None, download from Hugging Face Hub.
    """
    if local_bert_path is None:
        try:
            print(f"📥 Downloading BERT model from Hugging Face Hub ({repo_id})")
            return SentenceTransformer(repo_id)
        except Exception as e:
            raise RuntimeError(f"❌ Failed to download model from Hugging Face Hub ({repo_id}). Error: {e}")

    if not os.path.exists(local_bert_path):
        raise FileNotFoundError(
            f"❌ The specified local path does not exist: '{local_bert_path}'. "
            "Please provide a correct path or set it to None to download from the Hub."
        )

    try:
        print(f"📂 Loading local BERT model from {local_bert_path}")
        return SentenceTransformer(local_bert_path)
    except Exception as e:
        raise RuntimeError(f"❌ Failed to load local model from '{local_bert_path}'. Error: {e}")
```
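Usage-wise, the new loader fails fast on a bad local path instead of silently falling back to the Hub; passing `None` opts into the download. A short sketch (assumes network access to the Hub repo):

```python
from src.feature_engg.bert_embedding_data import load_bert_model

model = load_bert_model(local_bert_path=None)   # pulls Om-Shandilya/resume-matcher-bert
embedding = model.encode(["senior data scientist"], normalize_embeddings=True)
print(embedding.shape)
```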
src/feature_engg/tfidf_vectorizing_data.py
CHANGED

@@ -111,21 +111,24 @@ — both loaders now validate a given local path up front and otherwise fall through to a Hub download. Resulting code:

```python
def load_tfidf_vectorizer(local_vectorizer_path: str, repo_id: str, filename: str):
    """Load TF-IDF vectorizer, preferring local then HF Hub."""
    if local_vectorizer_path:
        if not os.path.exists(local_vectorizer_path):
            raise FileNotFoundError(f"❌ Local TF-IDF vectorizer not found at {local_vectorizer_path}")
        print(f"📂 Loading local TF-IDF vectorizer from {local_vectorizer_path}")
        return joblib.load(local_vectorizer_path)

    print(f"📥 Downloading TF-IDF vectorizer from Hugging Face Hub ({repo_id}/{filename})")
    vec_path = hf_hub_download(repo_id=repo_id, filename=filename)
    return joblib.load(vec_path)


def load_tfidf_matrix(local_matrix_path: str, repo_id: str, filename: str):
    """Load TF-IDF matrix, preferring local then HF Hub."""
    if local_matrix_path:
        if not os.path.exists(local_matrix_path):
            raise FileNotFoundError(f"❌ Local TF-IDF matrix not found at {local_matrix_path}")
        print(f"📂 Loading local TF-IDF matrix from {local_matrix_path}")
        return load_npz(local_matrix_path)

    print(f"📥 Downloading TF-IDF matrix from Hugging Face Hub ({repo_id}/{filename})")
    mat_path = hf_hub_download(repo_id=repo_id, filename=filename)
    return load_npz(mat_path)
```
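The TF-IDF loaders follow the same local-then-Hub contract as the BERT side. A sketch of the Hub path (no local files given):

```python
from src.feature_engg.tfidf_vectorizing_data import load_tfidf_vectorizer, load_tfidf_matrix

vectorizer = load_tfidf_vectorizer(None, "Om-Shandilya/resume-matcher-tfidf",
                                   "applicant/job_vectorizer.pkl")
job_matrix = load_tfidf_matrix(None, "Om-Shandilya/resume-matcher-tfidf",
                               "applicant/job_matrix.npz")
resume_vector = vectorizer.transform(["cleaned resume text"])   # hypothetical input
print(resume_vector.shape, job_matrix.shape)
```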
src/utils/file_reader.py
CHANGED

```diff
@@ -53,10 +53,13 @@ def extract_text_from_file(file_path):
 
     ext = os.path.splitext(file_path)[1].lower()
     if ext == '.pdf':
+        print(f"Extracting text from PDF {file_path}")
         return extract_text_from_pdf(file_path)
     elif ext == '.docx':
+        print(f"Extracting text from DOCX {file_path}")
         return extract_text_from_docx(file_path)
     elif ext == '.txt':
+        print(f"Extracting text from TXT {file_path}")
         return extract_text_from_txt(file_path)
     else:
         raise ValueError(f"Unsupported file type: {ext}")
```
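For completeness, the dispatcher's contract (the path is hypothetical; unsupported extensions raise `ValueError`):

```python
from src.utils.file_reader import extract_text_from_file

text = extract_text_from_file("candidate.docx")   # logs "Extracting text from DOCX ..."
print(text[:200])
```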