# HF-LLM-Intent-Detection / src / C_Train_and_save_embeddings.py
# (Header reconstructed from Hugging Face repo page residue:
#  uploader "georgeek", commit message "Transfer", revision 5ecde30.)
"""Embed intent utterances with a transformer model and persist a FAISS index.

Pipeline: load utterances from a CSV under <project>/data, embed them in
memory-friendly batches with a Hugging Face transformer, then write the
vectors to a FAISS index file named after the data file and model.
"""
import os
import sys
from pathlib import Path

# The sibling modules imported below (A_Preprocess, E_Model_utils,
# E_Faiss_utils) resolve relative to the project root, so extend sys.path
# BEFORE importing them.  (Previously this line ran after those imports,
# which broke the script when launched from any other working directory.)
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

import numpy as np  # noqa: F401 -- kept for the alternative code paths noted below
from sentence_transformers import SentenceTransformer  # noqa: F401 -- see alternatives note
from transformers import AutoModel, AutoTokenizer

from A_Preprocess import load_pdf_data
from E_Faiss_utils import save_faiss_embeddings_index
from E_Model_utils import (  # noqa: F401 -- get_* kept for the alternatives note
    batch_process_transformes_embeddings,
    get_embeddings,
    get_transformes_embeddings,
)

BASE_DIR = Path(__file__).resolve().parent.parent

# --- Load and preprocess data -------------------------------------------
file_name = 'InvoiceDetailsExplanation.csv'
data_file_path = BASE_DIR / "data" / file_name
data = load_pdf_data(str(data_file_path))
sentences = data['utterance'].tolist()
# For a quick smoke test, shrink the workload, e.g.:
#   import random; random.seed(42); random.shuffle(sentences)
#   sentences = sentences[:100]

# --- Model selection -----------------------------------------------------
# NOTE(review): the tokenizer may hit the network while the model is loaded
# with local_files_only=True -- confirm the weights are already cached
# locally, or apply the flag to both calls for consistency.
tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-small")
model = AutoModel.from_pretrained("intfloat/multilingual-e5-small", local_files_only=True)
model_name = 'multilingual-e5-small'
# Alternatives evaluated previously (swap model/model_name accordingly):
#   SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
#       with get_embeddings(model, sentences)
#   SentenceTransformer('AlexHung29629/sgpt-llama3.2-1b-stage1')
#   AutoModel "dumitrescustefan/bert-base-romanian-cased-v1"
#       with get_transformes_embeddings(sentences, model, tokenizer)
#       (single-pass; needs a lot of memory)

# --- Embed and persist ---------------------------------------------------
# Batched processing keeps peak memory low compared to embedding all
# sentences in one pass.
embeddings = batch_process_transformes_embeddings(
    sentences, model, tokenizer, batch_size=16, max_length=128
)
save_faiss_embeddings_index(embeddings, file_name=f"{file_name}_{model_name}_vector_db.index")
print(f'Embeddings shape: {embeddings.shape}.')