"""Build a FAISS index over utterance embeddings and run a sample query.

The script loads the utterances, embeds them with a SentenceTransformer
model, stores the vectors in a flat L2 FAISS index, round-trips that index
through a file on disk, and finally prints the nearest neighbours of one
example query sentence.
"""

from sentence_transformers import SentenceTransformer

from A_Preprocess import load_pdf_data

from E_Model_utils import get_embeddings

import numpy as np

import faiss


# --- Load the data ---------------------------------------------------------
# NOTE(review): machine-specific absolute path; the script only runs where
# this file exists.
data_file_path = r'C:\Users\serban.tica\Documents\tobi_llm_intent_recognition\data\Pager_Intents_Cleaned.csv'
data = load_pdf_data(data_file_path)

# --- Embed the utterances --------------------------------------------------
sentences = data['utterance'].tolist()

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
model_name = 'all-MiniLM-L6-v2'

embeddings = get_embeddings(model, sentences)
print(f'Embeddings shape: {embeddings.shape}.')

# FAISS works on float32 matrices, so convert before indexing.
embeddings = np.array(embeddings).astype('float32')

# --- Build the index -------------------------------------------------------
# The index dimensionality is taken from the embedding matrix's second axis.
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# --- Persist and reload the index ------------------------------------------
# Build the file name once so the write and the read cannot drift apart.
index_path = f"{model_name}_faiss.index"
faiss.write_index(index, index_path)
index = faiss.read_index(index_path)

# --- Query the index -------------------------------------------------------
# Sample query (Romanian, roughly "how much do I have to pay on the bill").
query_text = 'cat am de platit la factura'

# Embed the query with the same helper used for the corpus so both sides go
# through identical processing. Previously the query string was discarded and
# the first corpus vector was searched instead.
query_embedding = np.array(get_embeddings(model, [query_text])).astype('float32')
# index.search expects a 2-D (n_queries, dim) matrix.
query_embedding = query_embedding.reshape(1, -1)

D, I = index.search(query_embedding, 5)

print("Indices of nearest neighbors:", I)
print("Distances of nearest neighbors:", D)