|
from sentence_transformers import SentenceTransformer, InputExample, losses |
|
from transformers import AutoTokenizer, AutoModel |
|
import pandas as pd |
|
from torch.utils.data import DataLoader |
|
import numpy as np |
|
from pathlib import Path |
|
import os |
|
import sys |
|
|
|
# Directory one level above the folder that contains this file, with
# symlinks resolved. Used below as the root for saved model output.
BASE_DIR = Path(__file__).resolve().parent.parent

# Also expose that parent directory on the import path (presumably so that
# modules living next to this package can be imported -- confirm with callers).
sys.path.append(
    os.path.abspath(
        os.path.join(os.path.dirname(__file__), '..')
    )
)
|
|
|
|
|
def load_model(model_name):
    """Construct a ``SentenceTransformer`` from *model_name* and return it.

    NOTE(review): this file defines ``load_model`` a second time further
    down; after import the name is bound to that later definition.
    """
    return SentenceTransformer(model_name)
|
|
|
def get_embeddings(model, texts):
    """Encode *texts* with *model* and return the result of ``model.encode``.

    Args:
        model: Any object exposing an ``encode(texts)`` method.
        texts: Passed to ``model.encode`` unchanged.

    Returns:
        Whatever ``model.encode(texts)`` returns.
    """
    return model.encode(texts)
|
|
|
|
|
|
|
def _masked_mean_pool(last_hidden_state, attention_mask=None):
    """Average token embeddings over the token axis, ignoring padding.

    Args:
        last_hidden_state: Tensor indexed as (batch, token, hidden); it is
            reduced over dim 1.
        attention_mask: Optional (batch, token) tensor that is non-zero for
            real tokens and zero for padding. When ``None`` a plain mean over
            dim 1 is taken.

    Returns:
        Tensor with the token axis reduced away.
    """
    if attention_mask is None:
        return last_hidden_state.mean(dim=1)
    weights = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
    token_sum = (last_hidden_state * weights).sum(dim=1)
    # Guard against an all-padding row so the division never hits zero.
    token_count = weights.sum(dim=1).clamp(min=1e-9)
    return token_sum / token_count


def _embed_texts(texts, model, tokenizer, max_length):
    """Tokenize *texts*, run *model* without gradients, return pooled NumPy rows."""
    # Local import: only ``torch.utils.data`` is imported at module level.
    import torch

    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=max_length)

    # Keep the inputs on the same device as the model when it advertises one.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)

    # Padding positions must not contribute to the mean, otherwise the vector
    # for a text changes with the length of the other texts in its batch.
    pooled = _masked_mean_pool(outputs.last_hidden_state, inputs.get("attention_mask"))
    return pooled.detach().cpu().numpy()


def get_transformes_embeddings(text, model, tokenizer, max_length=128):
    """Embed *text* by mean-pooling the model's last hidden state.

    Args:
        text: A string or a list of strings accepted by *tokenizer*.
        model: Callable taking the tokenizer output as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.
        tokenizer: Callable producing ``"pt"`` tensors for *text*.
        max_length: Truncation length handed to the tokenizer (default 128,
            the value that used to be hard-coded here).

    Returns:
        NumPy array with one row per input text.
    """
    return _embed_texts(text, model, tokenizer, max_length)


def batch_process_transformes_embeddings(sentences, model, tokenizer, batch_size=16, max_length=128):
    """Embed *sentences* in chunks of *batch_size* and stack the results.

    Args:
        sentences: Sliceable sequence of strings.
        model: Callable taking the tokenizer output as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.
        tokenizer: Callable producing ``"pt"`` tensors for a list of strings.
        batch_size: Number of sentences sent through the model at once.
        max_length: Truncation length handed to the tokenizer.

    Returns:
        NumPy array with one row per sentence, in input order. For empty
        input a zero-row array is returned instead of failing in
        ``np.vstack``.
    """
    if len(sentences) == 0:
        hidden = getattr(getattr(model, "config", None), "hidden_size", 0) or 0
        return np.empty((0, hidden), dtype=np.float32)

    chunks = [
        _embed_texts(list(sentences[start:start + batch_size]), model, tokenizer, max_length)
        for start in range(0, len(sentences), batch_size)
    ]
    return np.vstack(chunks)
|
|
|
|
|
def fine_tune_and_save_model(model_name, dataset, epochs=1, batch_size=16, warmup_steps=100):
    """Fine-tune a SentenceTransformer on (utterance, intent) pairs and save it.

    Each row of *dataset* becomes one training pair whose two texts are the
    row's ``utterance`` and ``intent`` values. The tuned model is written to
    ``BASE_DIR / "output" / "fine-tuned-model" / model_name``.

    Args:
        model_name: Name or path handed to ``SentenceTransformer``; also used
            as the last component of the output directory.
        dataset: Table with ``utterance`` and ``intent`` columns.
        epochs: Number of training epochs (default 1).
        batch_size: Training batch size (default 16).
        warmup_steps: Warm-up steps passed to ``model.fit`` (default 100).

    Returns:
        The fine-tuned model.

    Raises:
        KeyError: If *dataset* lacks the ``utterance`` or ``intent`` column.
        ValueError: If *dataset* has no rows.
    """
    # Pull the columns first so a bad dataset fails before the model is loaded.
    pairs = list(zip(dataset["utterance"], dataset["intent"]))
    if not pairs:
        raise ValueError("dataset has no rows to fine-tune on")

    model = SentenceTransformer(model_name)

    train_examples = [
        InputExample(texts=[utterance, intent], label=1.0)
        for utterance, intent in pairs
    ]
    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)

    # NOTE(review): MultipleNegativesRankingLoss is understood to use the other
    # pairs in a batch as negatives, so rows that share an intent may act as
    # false negatives, and the label above is presumably unused by this loss --
    # confirm against the sentence-transformers documentation.
    train_loss = losses.MultipleNegativesRankingLoss(model)

    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=epochs,
        warmup_steps=warmup_steps,
    )

    path = Path(BASE_DIR) / "output" / "fine-tuned-model" / model_name
    model.save(str(path))

    return model
|
|
|
def load_model(model_path):
    """Construct a ``SentenceTransformer`` from *model_path* and return it.

    NOTE(review): ``load_model`` is also defined near the top of this file
    with a ``model_name`` parameter; this later definition replaces it.
    """
    return SentenceTransformer(model_path)