"""Embedding service for text embeddings and similarity search using GPU.""" import json import os from typing import Any, Dict, List, Optional, Tuple import faiss import numpy as np from sentence_transformers import SentenceTransformer from ..config import get_settings class EmbeddingService: """Service for text embeddings and similarity search.""" def __init__(self): self.settings = get_settings() self.model = None self.index = None self.job_metadata = {} self._load_model() self._load_index() def _load_model(self): """Load the sentence transformer model.""" try: self.model = SentenceTransformer(self.settings.embedding_model) # Use GPU if available if hasattr(self.model, "to"): import torch device = "cuda" if torch.cuda.is_available() else "cpu" self.model = self.model.to(device) print(f"Embedding model loaded on: {device}") except Exception as e: print(f"Error loading embedding model: {e}") self.model = None def _load_index(self): """Load or create FAISS index.""" try: if os.path.exists(self.settings.embeddings_cache_path): self.index = faiss.read_index(self.settings.embeddings_cache_path) # Load metadata metadata_path = self.settings.embeddings_cache_path.replace( ".faiss", "_metadata.json" ) if os.path.exists(metadata_path): with open(metadata_path, "r", encoding="utf-8") as f: self.job_metadata = json.load(f) print(f"Loaded FAISS index with {self.index.ntotal} vectors") else: # Create new index self.index = faiss.IndexFlatIP(self.settings.embedding_dimension) print("Created new FAISS index") except Exception as e: print(f"Error loading FAISS index: {e}") self.index = faiss.IndexFlatIP(self.settings.embedding_dimension) def _save_index(self): """Save FAISS index and metadata.""" try: os.makedirs( os.path.dirname(self.settings.embeddings_cache_path), exist_ok=True ) faiss.write_index(self.index, self.settings.embeddings_cache_path) # Save metadata metadata_path = self.settings.embeddings_cache_path.replace( ".faiss", "_metadata.json" ) with open(metadata_path, "w", encoding="utf-8") as f: json.dump(self.job_metadata, f, indent=2, default=str) print(f"Saved FAISS index with {self.index.ntotal} vectors") except Exception as e: print(f"Error saving FAISS index: {e}") def get_embedding(self, text: str) -> Optional[np.ndarray]: """Get embedding for a single text.""" if not self.model: return None try: embedding = self.model.encode([text]) # Normalize for cosine similarity (using Inner Product index) embedding = embedding / np.linalg.norm(embedding, axis=1, keepdims=True) return embedding[0] except Exception as e: print(f"Error generating embedding: {e}") return None def get_embeddings(self, texts: List[str]) -> Optional[np.ndarray]: """Get embeddings for multiple texts.""" if not self.model: return None try: embeddings = self.model.encode(texts, show_progress_bar=True) # Normalize for cosine similarity embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True) return embeddings except Exception as e: print(f"Error generating embeddings: {e}") return None def add_job_embeddings(self, jobs: List[Dict[str, Any]]): """Add job embeddings to the index.""" if not jobs: return # Prepare texts for embedding texts = [] job_ids = [] for job in jobs: # Combine job title, description, and requirements for embedding job_text = f"{job.get('title', '')} {job.get('description', '')} {job.get('requirements', '')}" texts.append(job_text) job_ids.append(job.get("id", len(self.job_metadata))) # Get embeddings embeddings = self.get_embeddings(texts) if embeddings is None: return # Add to index self.index.add(embeddings.astype("float32")) # Store metadata for i, job in enumerate(jobs): job_id = job_ids[i] self.job_metadata[str(len(self.job_metadata))] = { "job_id": job_id, "title": job.get("title", ""), "company": job.get("company", ""), "location": job.get("location", ""), "salary": job.get("salary", ""), "url": job.get("url", ""), "posted_date": job.get("posted_date", ""), "job_type": job.get("job_type", ""), "description": job.get("description", "")[:500], # Truncate for storage } self._save_index() print(f"Added {len(jobs)} job embeddings to index") def search_similar_jobs( self, profile_text: str, k: int = 20 ) -> List[Tuple[Dict[str, Any], float]]: """ Search for jobs similar to user profile. Args: profile_text: Combined user profile text for matching k: Number of top results to return Returns: List of tuples (job_metadata, similarity_score) """ if not self.index or self.index.ntotal == 0: return [] # Get profile embedding profile_embedding = self.get_embedding(profile_text) if profile_embedding is None: return [] # Search index try: scores, indices = self.index.search( profile_embedding[np.newaxis].astype("float32"), min(k, self.index.ntotal), ) results = [] for i, (score, idx) in enumerate(zip(scores[0], indices[0])): if idx == -1: # Invalid index continue job_meta = self.job_metadata.get(str(idx)) if job_meta: results.append((job_meta, float(score))) return results except Exception as e: print(f"Error searching similar jobs: {e}") return [] def clear_index(self): """Clear the index and metadata.""" self.index = faiss.IndexFlatIP(self.settings.embedding_dimension) self.job_metadata = {} self._save_index() print("Cleared job embeddings index") def get_index_stats(self) -> Dict[str, Any]: """Get statistics about the index.""" return { "total_jobs": self.index.ntotal if self.index else 0, "embedding_dimension": self.settings.embedding_dimension, "model_name": self.settings.embedding_model, "metadata_count": len(self.job_metadata), }