from sentence_transformers import SentenceTransformer, InputExample, losses
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import os
import sys

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
BASE_DIR = Path(__file__).resolve().parent.parent


# Load a SentenceTransformer from a Hub model name or a local path
# (e.g. the directory written by fine_tune_and_save_model below)
def load_model(model_name_or_path):
    model = SentenceTransformer(model_name_or_path)
    return model

def get_embeddings(model, texts):
    # Returns a numpy array of shape (len(texts), dim); pass
    # convert_to_tensor=True to encode() for a torch tensor instead
    embeddings = model.encode(texts)
    return embeddings


# Get embeddings from a pre-trained Hugging Face model. Encoding all texts
# in a single forward pass can require a lot of memory for large inputs.
def get_transformers_embeddings(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():  # inference only; skip building the autograd graph
        outputs = model(**inputs)
    # Mean pooling to get a single vector per sentence
    embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
    return embeddings

# Batch-processing variant - keeps peak memory low by encoding a few
# sentences at a time
def batch_process_transformers_embeddings(sentences, model, tokenizer, batch_size=16, max_length=128):
    all_embeddings = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
        all_embeddings.append(embeddings)
    return np.vstack(all_embeddings)
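
# Usage sketch for the two functions above. The checkpoint name
# "bert-base-uncased" is purely illustrative; any encoder checkpoint works:
#   tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
#   hf_model = AutoModel.from_pretrained("bert-base-uncased")
#   vectors = batch_process_transformers_embeddings(["hello", "world"], hf_model, tokenizer)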


def fine_tune_and_save_model(model_name, dataset):
    # Initialize the pre-trained model
    model = SentenceTransformer(model_name)

    # Build one InputExample per (utterance, intent) pair; no label is needed,
    # MultipleNegativesRankingLoss treats every pair as a positive
    train_examples = [InputExample(texts=[row['utterance'], row['intent']]) for _, row in dataset.iterrows()]

    # Define a DataLoader
    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

    # Define the loss function: the other examples in each batch act as negatives
    train_loss = losses.MultipleNegativesRankingLoss(model)

    # Fine-tune the model
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=1,
        warmup_steps=100
    )

    # Save the fine-tuned model (Hub names containing "/" create nested
    # directories under output/fine-tuned-model)
    path = BASE_DIR / "output" / "fine-tuned-model" / model_name
    model.save(str(path))

    return model

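

# Minimal end-to-end sketch, guarded so it only runs when this file is
# executed directly. The checkpoint name, the two-row dataset, and the
# column names are illustrative assumptions, not project data.
if __name__ == "__main__":
    # Hypothetical toy dataset matching the columns fine_tune_and_save_model expects
    data = pd.DataFrame({
        'utterance': ["what's the weather today", "play some jazz"],
        'intent': ["get_weather", "play_music"],
    })

    # "all-MiniLM-L6-v2" is an assumed checkpoint; substitute your own
    tuned = fine_tune_and_save_model("all-MiniLM-L6-v2", data)

    # Reload the saved model from disk and embed a query with it
    reloaded = load_model(str(BASE_DIR / "output" / "fine-tuned-model" / "all-MiniLM-L6-v2"))
    print(get_embeddings(reloaded, ["is it raining?"]).shape)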