import os
import re

import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

def save_embeddings(embeddings, file_name):
    """Save raw embeddings to .npy and build, persist, and return a FAISS index."""
    embeddings = embeddings.cpu().numpy()
    dimension = embeddings.shape[1]

    # Make sure the target directory (e.g. "embeddings/") exists before writing.
    os.makedirs(os.path.dirname(file_name) or ".", exist_ok=True)

    # Save the raw vectors first: faiss.normalize_L2 below mutates them in place.
    np.save(f"{file_name}_embeddings.npy", embeddings)

    # On unit-length vectors, L2 distance ranks neighbors the same as cosine similarity.
    index = faiss.IndexFlatL2(dimension)
    faiss.normalize_L2(embeddings)
    index.add(embeddings)
    faiss.write_index(index, file_name)

    return index

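# Note the two artifacts per model: calling, say,
# save_embeddings(emb, "embeddings/e5_vector_db.index") (an illustrative name)
# writes "embeddings/e5_vector_db.index" (the FAISS index) plus
# "embeddings/e5_vector_db.index_embeddings.npy" (the raw, unnormalized vectors).
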
def normalize_embeddings(embeddings):
    """Convert an embedding tensor to a numpy array scaled to unit L2 norm."""
    embeddings = embeddings.cpu().numpy()
    faiss.normalize_L2(embeddings)
    return embeddings

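# After normalization each row has unit norm, so inner products equal cosine
# similarity. A quick sanity check (a sketch, reusing the helpers above):
#
#   vecs = normalize_embeddings(get_embeddings(model, ["hello"]))
#   print(np.linalg.norm(vecs[0]))  # ~1.0
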
def load_model(model_name):
    """Load a pretrained SentenceTransformer; no fine-tuning happens here."""
    model = SentenceTransformer(model_name)
    return model

def get_embeddings(model, texts):
    """Encode a list of texts into a tensor of sentence embeddings."""
    embeddings = model.encode(texts, convert_to_tensor=True)
    return embeddings

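# Note: the intfloat/multilingual-e5 model cards recommend prefixing inputs
# with "query: " or "passage: "; embedding unprefixed text can hurt retrieval
# quality. A minimal sketch (prefix_passages is a hypothetical helper):
#
#   def prefix_passages(texts):
#       return [f"passage: {t}" for t in texts]
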
def load_data(file_path):
    """Read the intents CSV into a DataFrame."""
    data = pd.read_csv(file_path)
    return data

def clean_text(text):
    """Lowercase, drop punctuation and digits, and trim surrounding whitespace."""
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = re.sub(r'\d+', '', text)      # remove digits
    text = text.strip()
    return text

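# Example: clean_text("Hello, World 123!") returns "hello world".
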
def preprocess_data(data):
    """Clean every utterance in the dataset."""
    data['utterance'] = data['utterance'].apply(clean_text)
    return data

# Load and clean the intent dataset (the path below is machine-specific).
data_file_path = r"C:\Users\serban.tica\Documents\Intent_detection\data\Pager_Intents_Recent.csv"
data = load_data(data_file_path)
data = preprocess_data(data)

# Models to embed with: display name -> Hugging Face model id.
models = {
    "multilingual-e5-small": "intfloat/multilingual-e5-small",
}

# For each model: load it, embed all cleaned utterances, and persist a vector DB.
for model_name, model_path in models.items():
    print(f"Processing model: {model_name}")
    model = load_model(model_path)
    texts = data['utterance'].tolist()
    embeddings = get_embeddings(model, texts)
    save_embeddings(embeddings, file_name=f"embeddings/{model_name}_vector_db.index")

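# A minimal usage sketch (assumes the loop above has run): read an index back
# and look up the nearest training utterances for a new query. The query
# string is made up for illustration.
index = faiss.read_index(f"embeddings/{model_name}_vector_db.index")
query = normalize_embeddings(get_embeddings(model, [clean_text("how do I check my pager status")]))
distances, neighbor_ids = index.search(query, 5)
for i in neighbor_ids[0]:
    print(texts[i])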