import os
import sys
from pathlib import Path

# Make the project root importable before loading the local modules below.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

import numpy as np
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer

from A_Preprocess import load_pdf_data
from E_Faiss_utils import save_faiss_embeddings_index
from E_Model_utils import batch_process_transformes_embeddings, get_embeddings, get_transformes_embeddings

BASE_DIR = Path(__file__).resolve().parent.parent
# Load and preprocess data
file_name = 'InvoiceDetailsExplanation.csv'
data_file_path = BASE_DIR / "data" / file_name
data = load_pdf_data(str(data_file_path))
sentences = data['utterance'].tolist()
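# load_pdf_data is expected to return a DataFrame with an 'utterance' column.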
# Load the tokenizer and model; local_files_only=True expects a previously cached copy.
tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-small")
model = AutoModel.from_pretrained("intfloat/multilingual-e5-small", local_files_only=True)
model_name = 'multilingual-e5-small'
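# Note: E5-family models are trained with "query: " / "passage: " input prefixes;
# a minimal sketch, assuming the utterances are indexed as passages:
# sentences = [f"passage: {s}" for s in sentences]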
# Optionally sample 100 random sentences for faster testing:
# import random
# random.seed(42)
# random.shuffle(sentences)
# sentences = sentences[:100]
# ** Uncomment one of the following model/name pairs to use a sentence-transformers model instead **
#model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
#model_name = 'paraphrase-multilingual-MiniLM-L12-v2'
#model = SentenceTransformer('AlexHung29629/sgpt-llama3.2-1b-stage1')
#model_name = 'llama3.2-1b'
#model = SentenceTransformer('intfloat/multilingual-e5-small')
#model_name = 'multilingual-e5-small'
# ** Uncomment the following lines to compute and save embeddings with a sentence-transformers model **
#embeddings = get_embeddings(model, sentences)
#save_faiss_embeddings_index(embeddings, file_name=f"embeddings/{model_name}_vector_db.index")
#print(f'Embeddings shape: {embeddings.shape}.')
#print(embeddings[:10])
# ** Uncomment the following lines to embed with the Romanian BERT model instead **
#tokenizer = AutoTokenizer.from_pretrained("dumitrescustefan/bert-base-romanian-cased-v1")
#model = AutoModel.from_pretrained("dumitrescustefan/bert-base-romanian-cased-v1")
#model_name = 'bert-base-romanian-cased-v1'
# Single-pass embedding of all sentences requires a lot of memory:
#embeddings = get_transformes_embeddings(sentences, model, tokenizer)
# Batch processing keeps peak memory low on large datasets.
embeddings = batch_process_transformes_embeddings(sentences, model, tokenizer, batch_size=16, max_length=128)
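# A minimal sketch of what batch_process_transformes_embeddings is assumed to do
# (tokenize each batch, run the model, mean-pool the last hidden state over the attention mask):
# import torch
# def _batched_embed(texts, model, tokenizer, batch_size=16, max_length=128):
#     chunks = []
#     model.eval()
#     with torch.no_grad():
#         for i in range(0, len(texts), batch_size):
#             enc = tokenizer(texts[i:i + batch_size], padding=True, truncation=True,
#                             max_length=max_length, return_tensors="pt")
#             out = model(**enc).last_hidden_state          # (B, T, H)
#             mask = enc["attention_mask"].unsqueeze(-1)    # (B, T, 1)
#             pooled = (out * mask).sum(1) / mask.sum(1)    # masked mean pooling
#             chunks.append(pooled.cpu().numpy())
#     return np.concatenate(chunks, axis=0)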
save_faiss_embeddings_index(embeddings, file_name=f"{file_name}_{model_name}_vector_db.index")
print(f'Embeddings shape: {embeddings.shape}.')
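# A minimal usage sketch for the saved index (assumption: vectors are stored unnormalized,
# so neighbours are ranked by the index's default metric):
# import faiss
# index = faiss.read_index(f"{file_name}_{model_name}_vector_db.index")
# query_vec = batch_process_transformes_embeddings(["some user utterance"], model, tokenizer,
#                                                  batch_size=1, max_length=128)
# distances, ids = index.search(np.asarray(query_vec, dtype="float32"), k=5)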