File size: 2,231 Bytes
5ecde30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import pandas as pd
from E_Model_utils import fine_tune_and_save_model
from sentence_transformers import SentenceTransformer
from A_Preprocess import load_pdf_data
from pathlib import Path

# ---------------------------------------------------------------------------
# 1. Locate the training data
# ---------------------------------------------------------------------------
# BASE_DIR is the directory one level above the folder that contains this
# script (after resolving symlinks); the intents CSV sits in its "data" folder.
BASE_DIR = Path(__file__).resolve().parents[1]
data_file_path = BASE_DIR.joinpath("data", "Pager_Intents_cleaned.csv")

# Echo the resolved location so a wrong directory layout is visible right away.
print(data_file_path)

# The loader is handed a plain string rather than a Path object.
data = load_pdf_data(str(data_file_path))

# ---------------------------------------------------------------------------
# 2. Choose the base model
# ---------------------------------------------------------------------------
# Other model identifiers that were previously listed here as candidates
# (assign one to ``model_name`` to fine-tune it instead):
#   intfloat/multilingual-e5-small
#   sentence-transformers/paraphrase-multilingual-mpnet-base-v2
#   sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
#   sentence-transformers/all-MiniLM-L6-v2
#   sentence-transformers/all-mpnet-base-v2
#   sentence-transformers/all-distilroberta-v1
#   sentence-transformers/bert-base-nli-mean-tokens
#   sentence-transformers/bert-base-romanian-cased-v1
#   sentence-transformers/dumitrescustefan/bert-base-romanian-uncased-v1
#   dumitrescustefan/bert-base-romanian-cased-v1      ("Romanian BERT")
#   dumitrescustefan/bert-base-romanian-uncased-v1
#   bert-base-multilingual-cased                      ("mBERT")
#   xlm-roberta-base                                  ("XLM-R")
#   xlm-r-distilroberta-base-paraphrase-v1
#   McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp  (llama)
model_name = "BlackKakapo/stsb-xlm-r-multilingual-ro"

# ---------------------------------------------------------------------------
# 3. Fine-tune the selected model on the loaded data and save the result
# ---------------------------------------------------------------------------
fine_tune_and_save_model(model_name, data)