"""Build, persist and query a FAISS index over utterance embeddings.

The script loads utterances from a CSV file, embeds them with a
SentenceTransformer model, stores the vectors in a flat L2 FAISS index,
writes the index to disk, reloads it and runs one example
nearest-neighbour query against it.
"""

from sentence_transformers import SentenceTransformer
from A_Preprocess import load_pdf_data
from E_Model_utils import get_embeddings
import numpy as np
import faiss

# Load and preprocess data.
# NOTE(review): the helper is named ``load_pdf_data`` but is given a CSV
# path here -- confirm it is the intended loader.
data_file_path = r'C:\Users\serban.tica\Documents\tobi_llm_intent_recognition\data\Pager_Intents_Cleaned.csv'
data = load_pdf_data(data_file_path)
sentences = data['utterance'].tolist()

# The short model name is reused for the on-disk index filename, so define
# it once and derive the full hub identifier from it.
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(f'sentence-transformers/{model_name}')

embeddings = get_embeddings(model, sentences)
print(f'Embeddings shape: {embeddings.shape}.')

# FAISS works on float32 matrices, so convert before indexing.
embeddings = np.array(embeddings).astype('float32')

# Create a flat (exhaustive search) index using L2 distance and add every
# corpus vector to it.
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# Save the FAISS index, then load it back (as a later consumer would).
index_path = f"{model_name}_faiss.index"
faiss.write_index(index, index_path)
index = faiss.read_index(index_path)

# Example: find the 5 nearest neighbours of a free-text query.
# The query text must be embedded with the same model as the corpus;
# previously the text was discarded and the first corpus vector was
# searched instead, which only ever returned itself at distance 0.
query_text = 'cat am de platit la factura'
query_embedding = (
    np.array(get_embeddings(model, [query_text]))
    .astype('float32')
    .reshape(1, -1)  # index.search expects a 2-D (n_queries, dim) array
)

D, I = index.search(query_embedding, 5)  # D: distances, I: indices
print("Indices of nearest neighbors:", I)
print("Distances of nearest neighbors:", D)