Commit a9988a0
Parent(s): 042558f

Add RESTful API backend and decoupled frontend

Files changed:
- .gitignore +3 -1
- backend/main.py +120 -0
- backend/models.py +37 -0
- environment.yml +14 -23
- gui/app.py +71 -114
- pipelines/core/applicant.py +30 -5
- src/feature_engg/bert_embedding_data.py +21 -8
- src/processing/text_cleaning.py +3 -0
.gitignore
CHANGED
@@ -216,4 +216,6 @@ data/raw/*/*csv
 data/saved_plots/
 models/
 __pycache__/
-tests/
+tests/
+download_nltk_data.py
+nltk_data/
backend/main.py
ADDED
@@ -0,0 +1,120 @@
+from fastapi import FastAPI, HTTPException
+from contextlib import asynccontextmanager
+import sys
+import os
+
+# Add the parent directory to sys.path so the backend can find the 'src' and 'pipelines' modules.
+PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
+if PROJECT_ROOT not in sys.path:
+    sys.path.append(PROJECT_ROOT)
+
+from backend.models import (ResumeRequest, ApplicantResponse, JobMatch,
+                            RecruiterRequest, RecruiterResponse, ResumeMatch)
+from pipelines.core.applicant import run_bert_pipeline, run_tfidf_pipeline, load_job_titles
+from pipelines.core.recruiter import rank_with_bert, rank_with_tfidf
+from src.feature_engg.bert_embedding_data import load_bert_model, load_faiss_index
+from src.feature_engg.tfidf_vectorizing_data import load_tfidf_vectorizer, load_tfidf_matrix
+
+# In-memory storage for models (a dictionary holding all loaded models):
+ml_models = {}
+
+# Create a lifespan function to handle startup and shutdown events:
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """This code runs ONCE when the server starts up."""
+
+    print("🚀 Server starting up: Loading ML models...")
+
+    # Load Applicant Models
+    ml_models["bert_model"] = load_bert_model(local_bert_path=None, repo_id="Om-Shandilya/resume-matcher-bert")
+    ml_models["faiss_index"] = load_faiss_index(local_index_path=None, repo_id="Om-Shandilya/resume-matcher-bert", filename="applicant/jobs.faiss")
+    ml_models["applicant_vectorizer"] = load_tfidf_vectorizer(local_vectorizer_path=None, repo_id="Om-Shandilya/resume-matcher-tfidf", filename="applicant/job_vectorizer.pkl")
+    ml_models["applicant_matrix"] = load_tfidf_matrix(local_matrix_path=None, repo_id="Om-Shandilya/resume-matcher-tfidf", filename="applicant/job_matrix.npz")
+
+    # Load Recruiter Models
+    ml_models["recruiter_vectorizer"] = load_tfidf_vectorizer(local_vectorizer_path=None, repo_id="Om-Shandilya/resume-matcher-tfidf", filename="recruiter/combined_vectorizer.pkl")
+
+    # Load Job Titles DataFrames
+    ml_models["tfidf_job_df"] = load_job_titles(repo_id='Om-Shandilya/resume-matcher-tfidf', filename='applicant/tfidf_job_titles.csv')
+    ml_models["bert_job_df"] = load_job_titles(repo_id='Om-Shandilya/resume-matcher-bert', filename='applicant/bert_job_titles.csv')
+
+    print("✅ ML models loaded successfully.")
+
+    yield
+
+    # This code runs once when the server is shutting down.
+    print("Server shutting down: Clearing ML models...")
+    ml_models.clear()
+
+# Initializing the FastAPI app
+app = FastAPI(
+    title="Resume-Job Matcher API",
+    description="An API for matching resumes to jobs and ranking candidates.",
+    lifespan=lifespan
+)
+
+# Creating the API endpoints:
+@app.get("/")
+def read_root():
+    return {"status": "Resume Matcher API is running."}
+
+# Applicant side endpoints:
+@app.post("/applicant/match/bert", response_model=ApplicantResponse)
+async def match_resume_bert(request: ResumeRequest):
+    try:
+        matches, message = run_bert_pipeline(
+            raw_resume=request.raw_text,
+            model=ml_models["bert_model"],
+            job_index=ml_models["faiss_index"],
+            job_df=ml_models["bert_job_df"],
+            top_k=request.top_k)
+
+        response_matches = [JobMatch(job_title=title, match_score=score) for title, score in matches]
+        return ApplicantResponse(matches=response_matches, message=message)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.post("/applicant/match/tf-idf", response_model=ApplicantResponse)
+async def match_resume_tfidf(request: ResumeRequest):
+    try:
+        matches, message = run_tfidf_pipeline(
+            raw_resume=request.raw_text,
+            vectorizer=ml_models["applicant_vectorizer"],
+            job_matrix=ml_models["applicant_matrix"],
+            job_df=ml_models["tfidf_job_df"],
+            top_k=request.top_k)
+
+        response_matches = [JobMatch(job_title=title, match_score=score) for title, score in matches]
+        return ApplicantResponse(matches=response_matches, message=message)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+# Recruiter side endpoints:
+@app.post("/recruiter/rank/bert", response_model=RecruiterResponse)
+async def rank_resumes_bert(request: RecruiterRequest):
+    try:
+        matches, message = rank_with_bert(
+            raw_job_text=request.raw_job_text,
+            raw_resume_texts=request.raw_resume_texts,
+            model=ml_models["bert_model"],
+            top_k=request.top_k)
+
+        response_matches = [ResumeMatch(resume_filename=fname, match_score=score) for fname, score in matches]
+        return RecruiterResponse(matches=response_matches, message=message)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.post("/recruiter/rank/tf-idf", response_model=RecruiterResponse)
+async def rank_resumes_tfidf(request: RecruiterRequest):
+    try:
+        matches, message = rank_with_tfidf(
+            raw_job_text=request.raw_job_text,
+            raw_resume_texts=request.raw_resume_texts,
+            vectorizer=ml_models["recruiter_vectorizer"],
+            top_k=request.top_k)
+
+        response_matches = [ResumeMatch(resume_filename=fname, match_score=score) for fname, score in matches]
+        return RecruiterResponse(matches=response_matches, message=message)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
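All four endpoints share one shape: POST a JSON body matching the request model, receive matches plus a message. A minimal sketch of calling the BERT applicant endpoint once the server is up, assuming it was started with "uvicorn backend.main:app" on port 8000 (the address gui/app.py uses); the resume text here is a placeholder, not project data:

import requests

payload = {"raw_text": "Data analyst experienced in Python, SQL and Tableau.", "top_k": 3}
resp = requests.post("http://127.0.0.1:8000/applicant/match/bert", json=payload, timeout=180)
resp.raise_for_status()
for match in resp.json()["matches"]:
    print(f"{match['job_title']}: {match['match_score']:.3f}")

The TF-IDF route takes the same payload; only the path segment ("tf-idf" instead of "bert") differs.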
backend/models.py
ADDED
@@ -0,0 +1,37 @@
+from pydantic import BaseModel
+from typing import List, Dict
+
+
+# Applicant Side Models:
+class ResumeRequest(BaseModel):
+    """The request body for matching a single resume."""
+    raw_text: str
+    top_k: int | None = None
+
+class JobMatch(BaseModel):
+    """Represents a single job match with its score."""
+    job_title: str
+    match_score: float
+
+class ApplicantResponse(BaseModel):
+    """The response body containing job matches and a message."""
+    matches: List[JobMatch]
+    message: str
+
+
+# Recruiter Side Models:
+class RecruiterRequest(BaseModel):
+    """The request body for ranking multiple resumes against a job description."""
+    raw_job_text: str
+    raw_resume_texts: Dict[str, str]  # dict of {filename: raw_resume_text}
+    top_k: int | None = None
+
+class ResumeMatch(BaseModel):
+    """Represents a single ranked resume with its score."""
+    resume_filename: str
+    match_score: float
+
+class RecruiterResponse(BaseModel):
+    """The response body containing ranked resumes and a message."""
+    matches: List[ResumeMatch]
+    message: str
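These Pydantic models fix the wire format for both sides of the API. A quick sketch of the JSON they imply; sample values are illustrative, and .model_dump() assumes Pydantic v2 (on v1 the equivalent is .dict()):

from backend.models import ApplicantResponse, JobMatch, ResumeRequest

req = ResumeRequest(raw_text="...resume text...")  # top_k omitted -> None, i.e. "show all"
resp = ApplicantResponse(
    matches=[JobMatch(job_title="Data Scientist", match_score=0.87)],
    message="Matched against loaded job titles.",
)
print(resp.model_dump())
# {'matches': [{'job_title': 'Data Scientist', 'match_score': 0.87}], 'message': '...'}

Note the int | None annotation already requires Python 3.10+, which environment.yml pins.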
environment.yml
CHANGED
@@ -1,38 +1,29 @@
-# Tested on Windows 11 with NVIDIA GPU (CUDA driver 13.0, PyTorch CUDA 12.1 build)
-# Use: conda env create -f environment.yml
-
 name: resume-matcher
 channels:
+  - pytorch
   - conda-forge
   - defaults
 dependencies:
   - python=3.10
   - pip
-  # Core
-  -
+  # --- Core Application Dependencies ---
+  - fastapi
+  - uvicorn
+  - streamlit
+  - altair
   - pandas
   - scikit-learn
-  - scipy
   - joblib
-  -
-  - matplotlib
-  - seaborn
-  - wordcloud
-  # NLP / ML essentials
+  - pytorch
   - faiss-cpu
-  - nltk
-  - statsmodels
   - huggingface_hub
-  # File handling
-  - openpyxl
-  - lxml
-  - pillow
-  - pyyaml
-  - python-docx
   - pdfminer.six
-
+  - python-docx
+  - requests
+  - nltk
+  - scipy
+  - anyio
+  # --- Pip-only packages ---
  - pip:
    - sentence-transformers
-    -
-    - accelerate
-    - datasets
+    - python-multipart
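As the removed header comment documented, usage is unchanged: create the environment with "conda env create -f environment.yml", then "conda activate resume-matcher" (the name: field above). The Windows/CUDA note was dropped along with the plotting and GPU-training extras, trimming the list to what the backend and GUI now import (fastapi, uvicorn, streamlit, requests) plus the existing modelling stack.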
gui/app.py
CHANGED
@@ -1,29 +1,34 @@
 import streamlit as st
 import os
+import sys
 import tempfile
 import pandas as pd
 import shutil
-import sys
 import altair as alt
-
+import requests  # Import for making API requests
+
+# Ensure the parent directory is in sys.path for imports
+PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
+if PROJECT_ROOT not in sys.path:
+    sys.path.append(PROJECT_ROOT)
 from src.utils.bulk_loading import bulk_load_raw_resume_files
 from src.utils.file_reader import extract_text_from_file
-from src.utils.model_loader import get_applicant_matrix, get_applicant_vectorizer, get_bert_model, get_faiss_index, get_recruiter_vectorizer
-from pipelines.core.applicant import run_tfidf_pipeline as applicant_tfidf, run_bert_pipeline as applicant_bert
-from pipelines.core.recruiter import rank_with_tfidf as recruiter_tfidf, rank_with_bert as recruiter_bert
 
-#
+# Configuring the backend API URL
+API_URL = "http://127.0.0.1:8000"
+
+# Configuring the Streamlit app
 st.set_page_config(
     page_title="Resume-Job Matcher",
-    page_icon="
+    page_icon="👨‍💼",
     layout="wide"
 )
 
-#
+# Main app title and description
 st.title("🎯 AI-Powered Resume-Job Matcher")
 st.write("---")
 
-#
+# Creating sidebar for controls
 with st.sidebar:
     st.header("Controls")
     app_mode = st.radio(
@@ -34,36 +39,28 @@ with st.sidebar:
     model_choice = st.selectbox(
         "Choose the AI Model",
         ("TF-IDF", "BERT"),
-        help="TF-IDF is
+        help="TF-IDF is baseline. BERT is more accurate and semantic."
     )
-
     st.write("---")
-
-    # Add a checkbox to control the 'show all' feature
     show_all = st.checkbox("Show all matches", value=False)
-
     if show_all:
         top_k = None
-        # Disable the slider when 'show_all' is checked for better UX
         st.slider(
-            "Number of matches to show",
+            "Number of matches to show",
            min_value=1, max_value=50, value=5, step=1,
            disabled=True
        )
        st.info("Showing all ranked results.")
    else:
-        # Enable the slider when 'show_all' is unchecked
        top_k = st.slider(
-            "Number of matches to show",
+            "Number of matches to show",
            min_value=1, max_value=50, value=5, step=1,
            disabled=False
        )
 
-
-# --- Applicant View ---
+# Applicant view of the app
 if app_mode == "Applicant":
     st.header("Applicant: Match Your Resume to a Job")
-
     resume_file = st.file_uploader(
         "Upload your resume",
         type=['pdf', 'docx', 'txt'],
@@ -73,150 +70,110 @@ if app_mode == "Applicant":
     if resume_file:
         st.success(f"✅ Successfully uploaded `{resume_file.name}`")
         if st.button("Find Top Job Matches", type="primary", width='stretch'):
-
-            with st.spinner(f"Analyzing resume with {model_choice}..."):
-
+            with st.spinner(f"Sending your resume to the AI backend for matching..."):
                 tmp_file_path = None
                 try:
                     with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(resume_file.name)[1]) as tmp_file:
                         tmp_file.write(resume_file.getvalue())
                         tmp_file_path = tmp_file.name
-
                     raw_resume_text = extract_text_from_file(tmp_file_path)
 
-
-
-
-
-
-
-
-
-
-                    applicant_vectorizer = get_applicant_vectorizer()
-                    applicant_matrix = get_applicant_matrix()
-                    matches, message = applicant_tfidf(raw_resume_text,
-                                                       vectorizer=applicant_vectorizer,
-                                                       job_matrix=applicant_matrix,
-                                                       top_k=top_k)
+                    endpoint = f"{API_URL}/applicant/match/{model_choice.lower()}"
+                    payload = {"raw_text": raw_resume_text, "top_k": top_k}
+
+                    response = requests.post(endpoint, json=payload, timeout=180)  # 3-minute timeout
+                    response.raise_for_status()  # Raises HTTPError for bad responses, e.g. 4xx, 5xx
+
+                    api_data = response.json()
+                    matches = api_data.get("matches", [])
+                    message = api_data.get("message", "No message from server.")
 
                     if not matches:
                         st.warning("⚠️ No suitable job matches found.")
                     else:
-                        st.subheader(f"Top {len(matches)} Job Matches:")
                         st.info(message)
-
-
-
-
-                        df = df.sort_values(by="Match Score", ascending=False).reset_index(drop=True)
+                        st.subheader(f"Top {len(matches)} Job Matches:")
+
+                        df = pd.DataFrame(matches)  # Pandas handles a list of dicts directly
+                        df = df.sort_values(by="match_score", ascending=False).reset_index(drop=True)
 
                         chart = alt.Chart(df).mark_bar().encode(
-                            y=alt.Y('
-                            x=alt.X('
-
-
-
-                        ).properties(
-                            # Set a responsive title for the chart to indicate what the bars represent
-                            title="Relative Job Match Scores"
-                        ).interactive()
-
+                            y=alt.Y('job_title', sort='-x', title=None, axis=alt.Axis(labelLimit=400)),
+                            x=alt.X('match_score', axis=None, scale=alt.Scale(domainMin=0)),
+                            tooltip=['job_title', alt.Tooltip('match_score', format='.3f')]
+                        ).properties(title="Relative Job Match Scores").interactive()
+
                         st.altair_chart(chart, use_container_width=True)
 
+                except requests.exceptions.RequestException as e:
+                    st.error(f"API Error: Could not connect to the backend. Please ensure the backend server is running. Details: {e}")
                 except Exception as e:
                     st.error(f"An error occurred: {e}")
-
                 finally:
                     if tmp_file_path and os.path.exists(tmp_file_path):
                         os.unlink(tmp_file_path)
 
-
-# --- Recruiter View ---
+# Recruiter view of the app
 if app_mode == "Recruiter":
     st.header("Recruiter: Rank Resumes for a Job Description")
-
-    job_desc_file = st.file_uploader(
-        "Upload the job description",
-        type=['pdf', 'docx', 'txt'],
-        help="Upload the job description in PDF, DOCX, or TXT format."
-    )
-
-    resume_files = st.file_uploader(
-        "Upload candidate resumes",
-        type=['pdf', 'docx', 'txt'],
-        accept_multiple_files=True,
-        help="Upload one or more resumes."
-    )
+    job_desc_file = st.file_uploader("Upload the job description", type=['pdf', 'docx', 'txt'])
+    resume_files = st.file_uploader("Upload candidate resumes", type=['pdf', 'docx', 'txt'], accept_multiple_files=True)
 
     if job_desc_file and resume_files:
-        st.success(f"✅ Successfully uploaded job description
+        st.success(f"✅ Successfully uploaded job description and {len(resume_files)} resumes.")
         if st.button("Rank Resumes", type="primary", width='stretch'):
-
-            with st.spinner(f"Ranking {len(resume_files)} resumes with {model_choice}..."):
-
-                # Paths for cleanup in the finally block
+            with st.spinner(f"Sending files to the AI backend for ranking..."):
                 temp_dir = None
                 job_desc_path = None
-
                 try:
-                    # 1. Handle the single job description file
                    with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(job_desc_file.name)[1]) as tmp_file:
                        tmp_file.write(job_desc_file.getvalue())
                        job_desc_path = tmp_file.name
                    raw_job_text = extract_text_from_file(job_desc_path)
 
-                    # 2. Handle multiple resume files by creating a temp directory for bulk loading
                    temp_dir = tempfile.mkdtemp()
                    for resume_file in resume_files:
-
-                        with open(resume_path, "wb") as f:
+                        with open(os.path.join(temp_dir, resume_file.name), "wb") as f:
                            f.write(resume_file.getbuffer())
-
-                    # Bulk loading all resumes from the temp directory
                    raw_resume_texts = bulk_load_raw_resume_files(temp_dir)
 
-
-
-
-
-
-
-
-
-
-
-
-
-                        top_k=top_k)
+                    endpoint = f"{API_URL}/recruiter/rank/{model_choice.lower()}"
+                    payload = {
+                        "raw_job_text": raw_job_text,
+                        "raw_resume_texts": raw_resume_texts,
+                        "top_k": top_k
+                    }
+                    response = requests.post(endpoint, json=payload, timeout=300)  # 5-minute timeout
+                    response.raise_for_status()  # Raises HTTPError for bad responses, e.g. 4xx, 5xx
+
+                    api_data = response.json()
+                    ranked_resumes = api_data.get("matches", [])
+                    message = api_data.get("message", "No message from server.")
 
-                    # 4. Display results
                    if not ranked_resumes:
                        st.warning("⚠️ Could not rank resumes. Please check the files.")
                    else:
-                        st.subheader(f"Top {len(ranked_resumes)} Ranked Resumes:")
                        st.info(message)
-
-
-                        df["
+                        st.subheader(f"Top {len(ranked_resumes)} Ranked Resumes:")
+                        df = pd.DataFrame(ranked_resumes)
+                        df["match_score"] = df["match_score"].apply(lambda x: min(1.0, x))
                        st.dataframe(
                            df,
-                            column_config={
-
-
-
-
-
-
-                            hide_index=True,
+                            column_config={
+                                "resume_filename": st.column_config.TextColumn("Resume"),
+                                "match_score": st.column_config.ProgressColumn(
+                                    "Match Score", format="%.2f", min_value=0, max_value=1
+                                ),
+                            },
+                            hide_index=True,
                        )
 
+                except requests.exceptions.RequestException as e:
+                    st.error(f"API Error: Could not connect to the backend. Please ensure the backend server is running. Details: {e}")
                except Exception as e:
-                    st.error(f"
-
+                    st.error(f"An error occurred: {e}")
                finally:
-                    # 5. Clean up all temporary files and the directory
                    if job_desc_path and os.path.exists(job_desc_path):
                        os.unlink(job_desc_path)
                    if temp_dir and os.path.exists(temp_dir):
-                        shutil.rmtree(temp_dir)
+                        shutil.rmtree(temp_dir)
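With the frontend decoupled, gui/app.py no longer loads any models itself; it only needs the FastAPI process alive at API_URL. A hypothetical two-process dev launcher (a sketch, not part of this commit; port 8000 matches API_URL above):

import subprocess
import sys

# Start the FastAPI backend, then the Streamlit frontend, as two separate processes.
backend = subprocess.Popen([sys.executable, "-m", "uvicorn", "backend.main:app", "--port", "8000"])
frontend = subprocess.Popen([sys.executable, "-m", "streamlit", "run", "gui/app.py"])
try:
    frontend.wait()      # run until the UI process exits
finally:
    backend.terminate()  # then take the API down with it

Keeping model loading in the API process is the point of the lifespan handler: Streamlit reruns its script on every interaction, so the heavy artifacts now survive in the long-lived FastAPI process instead of being re-fetched per rerun.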
pipelines/core/applicant.py
CHANGED
@@ -1,5 +1,7 @@
 import pandas as pd
+import os
 from pathlib import Path
+from huggingface_hub import hf_hub_download
 from src.feature_engg.tfidf_vectorizing_data import load_tfidf_vectorizer, load_tfidf_matrix
 from src.feature_engg.bert_embedding_data import load_bert_model, load_faiss_index
 from src.processing.text_cleaning import clean_text, clean_text_for_bert
@@ -7,8 +9,22 @@ from src.matching.matching_engine import compute_similarity_matrix, top_n_tfidf_
 
 # Defining paths for data files
 PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
-def load_job_titles(job_csv_path: str):
-    df = pd.read_csv(job_csv_path)
+def load_job_titles(local_path=None, repo_id=None, filename=None):
+    """
+    Load job titles, preferring a local path if provided, otherwise
+    downloading from the Hugging Face Hub.
+    """
+    file_path = ""
+    if local_path and os.path.exists(local_path):
+        print(f"📂 Using local job titles from {local_path}")
+        file_path = local_path
+    elif repo_id and filename:
+        print(f"🌐 Downloading job titles from Hugging Face Hub ({repo_id}/{filename})")
+        file_path = hf_hub_download(repo_id=repo_id, filename=filename)
+    else:
+        raise ValueError("Must provide either a valid local_path or repo_id and filename.")
+
+    df = pd.read_csv(file_path)
     if "title" not in df.columns:
         raise ValueError("Job CSV must contain a 'title' column.")
     return df
@@ -16,6 +32,7 @@ def load_job_titles(job_csv_path: str):
 def run_tfidf_pipeline(raw_resume: str, *,
                        vectorizer=None,
                        job_matrix=None,
+                       job_df=None,
                        local_vectorizer_path=None,
                        local_matrix_path=None,
                        repo_id="Om-Shandilya/resume-matcher-tfidf",
@@ -28,7 +45,8 @@ def run_tfidf_pipeline(raw_resume: str, *,
     Args:
         raw_resume (str): Raw text of the resume.
         vectorizer (TfidfVectorizer, optional): Preloaded TF-IDF vectorizer.
-        job_matrix (scipy.sparse matrix, optional): Preloaded TF-IDF job matrix
+        job_matrix (scipy.sparse matrix, optional): Preloaded TF-IDF job matrix.
+        job_df (pd.DataFrame, optional): DataFrame of job titles.
         local_vectorizer_path (str, optional): Local path to TF-IDF vectorizer.
         local_matrix_path (str, optional): Local path to TF-IDF matrix.
         repo_id (str): Hugging Face repo ID for vectorizer/matrix.
@@ -51,7 +69,9 @@ def run_tfidf_pipeline(raw_resume: str, *,
     resume_vector = vectorizer.transform([cleaned_resume])
     sim_matrix = compute_similarity_matrix(resume_vector, job_matrix)
 
-    job_df
+    if job_df is None:
+        job_df = load_job_titles(repo_id='Om-Shandilya/resume-matcher-tfidf', filename='applicant/tfidf_job_titles.csv')
+
     total_jobs = len(job_df['title'].unique())
 
     message = ""
@@ -81,6 +101,7 @@ def run_tfidf_pipeline(raw_resume: str, *,
 def run_bert_pipeline(raw_resume: str, *,
                       model=None,
                       job_index=None,
+                      job_df=None,
                      local_bert_path=None,
                      local_index_path=None,
                      repo_id="Om-Shandilya/resume-matcher-bert",
@@ -93,6 +114,7 @@ def run_bert_pipeline(raw_resume: str, *,
         raw_resume (str): Raw text of the resume.
         model (SentenceTransformer, optional): Preloaded BERT model.
         job_index (faiss.Index, optional): Preloaded FAISS index.
+        job_df (pd.DataFrame, optional): DataFrame of job titles.
        local_bert_path (str, optional): Local path to BERT model.
        local_index_path (str, optional): Local path to FAISS index.
        repo_id (str): Hugging Face repo ID for model/index.
@@ -113,7 +135,10 @@ def run_bert_pipeline(raw_resume: str, *,
     resume_embedding = model.encode([cleaned_resume], normalize_embeddings=True)
 
     D, I = job_index.search(resume_embedding, job_index.ntotal)
-
+
+    if job_df is None:
+        job_df = load_job_titles(repo_id='Om-Shandilya/resume-matcher-bert', filename='applicant/bert_job_titles.csv')
+
     total_jobs = len(job_df['title'].unique())
 
     message = ""
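The new keyword-only job_df parameter completes the dependency-injection pattern: the FastAPI lifespan passes preloaded artifacts, while a bare call falls back to fetching them from the Hub via the repo_id/filename defaults. A sketch of the standalone fallback path (resume text is illustrative, and it assumes the unchanged function body resolves the remaining None arguments from those defaults, as the signature suggests):

from pipelines.core.applicant import run_tfidf_pipeline

# No preloaded artifacts: vectorizer, matrix and job_df come from the Hub.
matches, message = run_tfidf_pipeline("Data analyst skilled in Python and SQL.", top_k=5)
print(message)
for title, score in matches:  # (job_title, match_score) pairs, as consumed in backend/main.py
    print(f"{title}: {score:.3f}")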
src/feature_engg/bert_embedding_data.py
CHANGED
@@ -100,17 +100,30 @@ def bert_embed_text(df: pd.DataFrame,
     return embeddings, model
 
 
-
-
+import faiss
+import os
+from huggingface_hub import hf_hub_download
+
+def load_faiss_index(local_index_path: str, repo_id: str, filename: str, lazy_loading: bool = True):
+    """
+    Load FAISS index, preferring local then HF Hub. Applies lazy loading by default.
+    """
+    index_path = ""
     if local_index_path:
         if not os.path.exists(local_index_path):
             raise FileNotFoundError(f"❌ Local FAISS index not found at {local_index_path}")
-        print(f"
-
-
-
-
-
+        print(f"📂 Using local FAISS index from {local_index_path}")
+        index_path = local_index_path
+    else:
+        print(f"🌐 Downloading FAISS index from Hugging Face Hub ({repo_id}/{filename})")
+        index_path = hf_hub_download(repo_id=repo_id, filename=filename)
+
+    if lazy_loading:
+        print(" -> Loading with lazy loading (MMAP).")
+        return faiss.read_index(index_path, faiss.IO_FLAG_MMAP)
+    else:
+        print(" -> Loading into memory directly.")
+        return faiss.read_index(index_path)
 
 def load_bert_model(local_bert_path: str, repo_id: str='Om-Shandilya/resume-matcher-bert'):
     """
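load_faiss_index defaults to memory-mapping (faiss.IO_FLAG_MMAP), which leaves the index on disk and pages vectors in on demand, a sensible default for a memory-constrained Space; lazy_loading=False reads the whole index into RAM instead. A usage sketch (repo and filename are the values backend/main.py passes):

from src.feature_engg.bert_embedding_data import load_faiss_index

# Memory-mapped by default:
index = load_faiss_index(local_index_path=None,
                         repo_id="Om-Shandilya/resume-matcher-bert",
                         filename="applicant/jobs.faiss")

# Or fully in memory for faster repeated searches:
# index = load_faiss_index(None, "Om-Shandilya/resume-matcher-bert",
#                          "applicant/jobs.faiss", lazy_loading=False)

print(index.ntotal)  # number of indexed job vectors, identical either way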
src/processing/text_cleaning.py
CHANGED
@@ -1,4 +1,5 @@
 import re
+import os
 import string
 import pandas as pd
 from typing import Optional
@@ -9,6 +10,8 @@ from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
 import unicodedata
 
+nltk.data.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'nltk_data'))
+
 # Only download necessary NLTK resources if not already present
 nltk_packages = {
     "stopwords": "corpora/stopwords",