"""Utilities for loading, embedding with, and fine-tuning sentence-transformer models."""

from pathlib import Path
import os
import sys

import numpy as np
import torch
from torch.utils.data import DataLoader

from sentence_transformers import SentenceTransformer, InputExample, losses
from transformers import AutoTokenizer, AutoModel  # noqa: F401  (kept for callers that build model/tokenizer)
import pandas as pd  # noqa: F401  (dataset argument of fine_tune_and_save_model is a DataFrame)

# Make the parent package importable when this file is run as a script.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

# Project root (two levels above this file).
BASE_DIR = Path(__file__).resolve().parent.parent


def load_model(model_path):
    """Load a SentenceTransformer from a Hub model name or a local path.

    NOTE(review): the original file defined ``load_model`` twice with
    identical bodies; the second definition silently shadowed the first.
    They are merged into this single definition (second signature kept,
    since it was the one in effect).
    """
    return SentenceTransformer(model_path)


def get_embeddings(model, texts):
    """Encode *texts* with a SentenceTransformer model.

    Returns a numpy array of shape (len(texts), embedding_dim).
    """
    return model.encode(texts)


def get_transformes_embeddings(text, model, tokenizer):
    """Embed *text* with a raw ``transformers`` model via mean pooling.

    Tokenizes up to 128 tokens, runs the model, and mean-pools the last
    hidden state over the token dimension to get one vector per sentence.
    Holds the whole batch in memory at once — prefer
    ``batch_process_transformes_embeddings`` for large inputs.

    Returns a numpy array of shape (n_sentences, hidden_dim).
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True,
                       truncation=True, max_length=128)
    # Inference only: no_grad skips building the autograd graph, which is
    # the dominant memory cost the original comment warned about.
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean pooling over the token (seq) dimension -> one vector per sentence.
    return outputs.last_hidden_state.mean(dim=1).numpy()


def batch_process_transformes_embeddings(sentences, model, tokenizer,
                                         batch_size=16, max_length=128):
    """Embed *sentences* in batches of *batch_size* (low-memory variant).

    Same mean-pooling scheme as ``get_transformes_embeddings``; processes
    ``batch_size`` sentences at a time so peak memory stays bounded.

    Returns a numpy array of shape (len(sentences), hidden_dim).
    """
    all_embeddings = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True,
                           truncation=True, max_length=max_length)
        with torch.no_grad():  # inference only — avoid autograd memory
            outputs = model(**inputs)
        all_embeddings.append(outputs.last_hidden_state.mean(dim=1).numpy())
    return np.vstack(all_embeddings)


def fine_tune_and_save_model(model_name, dataset):
    """Fine-tune *model_name* on (utterance, intent) pairs and save the result.

    Parameters
    ----------
    model_name : str
        Name/path of the pre-trained SentenceTransformer to start from.
    dataset : pandas.DataFrame
        Must have ``utterance`` and ``intent`` columns; each row is treated
        as a positive pair.

    Returns
    -------
    SentenceTransformer
        The fine-tuned model (also saved under
        ``BASE_DIR / "output" / "fine-tuned-model" / model_name``).
    """
    model = SentenceTransformer(model_name)
    # One positive pair per row. MultipleNegativesRankingLoss ignores the
    # label and uses the rest of the batch as in-batch negatives.
    train_examples = [
        InputExample(texts=[row['utterance'], row['intent']], label=1.0)
        for _, row in dataset.iterrows()
    ]
    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
    train_loss = losses.MultipleNegativesRankingLoss(model)
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=1,
        warmup_steps=100,
    )
    # BASE_DIR is already a Path — no need to re-wrap it.
    # NOTE(review): a model_name containing "/" (e.g. an org-prefixed Hub
    # name) creates nested directories here — confirm that is intended.
    path = BASE_DIR / "output" / "fine-tuned-model" / model_name
    model.save(str(path))
    return model