File size: 1,399 Bytes
5ecde30 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
from sentence_transformers import SentenceTransformer
from A_Preprocess import load_pdf_data
from E_Model_utils import get_embeddings
import numpy as np
import faiss
# Load the utterance corpus and embed every utterance with a
# sentence-transformers model.
# NOTE(review): the path below is absolute and machine-specific; the script
# only runs where this exact file exists.
data_file_path = r'C:\Users\serban.tica\Documents\tobi_llm_intent_recognition\data\Pager_Intents_Cleaned.csv'
data = load_pdf_data(data_file_path)
sentences = data['utterance'].tolist()

# Single source of truth for the model name: the hub identifier is derived
# from it, so the name used for the saved index file can never drift from the
# model that actually produced the embeddings.
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(f'sentence-transformers/{model_name}')

# One embedding per utterance; the shape is printed as a quick sanity check.
embeddings = get_embeddings(model, sentences)
print(f'Embeddings shape: {embeddings.shape}.')
# Persist the embeddings as a FAISS index.
# FAISS operates on C-contiguous float32 matrices, so convert once up front
# (a single conversion instead of np.array(...) followed by .astype(...)).
embeddings = np.ascontiguousarray(embeddings, dtype='float32')

# Exact (brute-force) L2-distance index sized to the embedding dimension.
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# Build the file name once so the write and the read cannot point at
# different files.
index_path = f"{model_name}_faiss.index"
faiss.write_index(index, index_path)

# Reload from disk (for later use); this also confirms the saved file is
# readable.
index = faiss.read_index(index_path)
# Example query: find the 5 nearest neighbours of a free-text utterance.
query_text = 'cat am de platit la factura'

# Embed the query text with the same model and helper used for the corpus.
# Previously the text was overwritten by embeddings[0], so it was never
# actually searched for.
# NOTE(review): assumes get_embeddings accepts a list of strings and returns
# one vector per input, as in its call on the corpus -- confirm against
# E_Model_utils.
query_embedding = np.asarray(
    get_embeddings(model, [query_text]), dtype='float32'
).reshape(1, -1)  # index.search expects a 2-D (n_queries, dim) float32 array

D, I = index.search(query_embedding, 5)  # D: distances, I: indices
print("Indices of nearest neighbors:", I)
print("Distances of nearest neighbors:", D)