Saving and Loading the fine-tuned model
I fine-tuned the model on my data with the SentenceTransformers library, but just calling model.save() does not work: it saves without errors, but when I reload it in the next session I get
Some weights of BertModel were not initialized from the model checkpoint at ... and are newly initialized
Can you please help: how can I save and reload the model (ideally with the SentenceTransformers library)?
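Roughly, my flow looks like the sketch below ("my_checkpoint" is just a placeholder path; the fine-tuning itself is omitted):
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("jinaai/jina-embeddings-v2-base-en")
# ... fine-tune with model.fit(...) ...

model.save("my_checkpoint")  # completes without errors

# in the next session, reloading triggers the warning:
model = SentenceTransformer("my_checkpoint")
# "Some weights of BertModel were not initialized from the model checkpoint at ..."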
OK, I think I found a workaround:
!git clone https://huggingface.co/jinaai/jina-bert-implementation
!mv jina-bert-implementation jina_bert_implementation
!touch jina_bert_implementation/__init__.py
import torch
from transformers import AutoTokenizer

from jina_bert_implementation.modeling_bert import JinaBertModel

device = "cuda" if torch.cuda.is_available() else "cpu"

checkpoint = "my_checkpoint"
model = JinaBertModel.from_pretrained(checkpoint)
model.to(device)

# Mean pooling - take the attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # first element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# Load the tokenizer from the Hugging Face Hub
tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-base-en')

# Tokenize sentences (`sentences` is the list of strings to embed)
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
encoded_input = {key: val.to(device) for key, val in encoded_input.items()}

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, mean pooling.
sentences_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
I would still appreciate help, because I want to load it as a SentenceTransformer for ease of use.
Hi @Maiia, can you manually edit the SentenceTransformer class and add trust_remote_code=True where sbert does the AutoModel.from_pretrained(...) call?
I think the SBert main branch supports it, but not the latest PyPI release.
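In other words, with a Sentence Transformers build that forwards the flag, loading should reduce to something like this sketch (the checkpoint path is a placeholder):
from sentence_transformers import SentenceTransformer

# assumes a sentence-transformers version that passes trust_remote_code through to AutoModel
model = SentenceTransformer("my_checkpoint", trust_remote_code=True)
embeddings = model.encode(["An example sentence", "Another one"])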
I was not able to find where to change it, but I adapted the function and created a class similar to SentenceTransformer (at least it does the encoding efficiently).
Maybe someone else will find it useful:
from tqdm.notebook import trange
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # first element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

class JinaSentEmbedder:
    def __init__(self, path):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = AutoModel.from_pretrained(path, trust_remote_code=True)
        self.model = self.model.to(self.device)
        self.tokenize = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v2-base-en")

    def _text_length(self, text):
        if isinstance(text, dict):  # {key: value} case
            return len(next(iter(text.values())))
        elif not hasattr(text, '__len__'):  # object has no len() method
            return 1
        elif len(text) == 0 or isinstance(text[0], int):  # empty string or list of ints
            return len(text)
        else:
            return sum([len(t) for t in text])  # sum of lengths of the individual strings

    def encode(self, sentences,
               batch_size=32,
               show_progress_bar=None,
               output_value: str = 'sentence_embedding',
               convert_to_numpy: bool = True,
               convert_to_tensor: bool = False,
               device: str = None,
               normalize_embeddings: bool = False):
        self.model.eval()
        if convert_to_tensor:
            convert_to_numpy = False
        if output_value != 'sentence_embedding':
            convert_to_tensor = False
            convert_to_numpy = False

        input_was_string = False
        if isinstance(sentences, str) or not hasattr(sentences, '__len__'):  # cast an individual sentence to a list of length 1
            sentences = [sentences]
            input_was_string = True

        all_embeddings = []
        # sort by length so each batch contains sentences of similar length
        length_sorted_idx = np.argsort([-self._text_length(sen) for sen in sentences])
        sentences_sorted = [sentences[idx] for idx in length_sorted_idx]

        for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=not show_progress_bar):
            sentences_batch = sentences_sorted[start_index:start_index + batch_size]
            encoded_input = self.tokenize(sentences_batch, padding=True, truncation=True, return_tensors='pt')
            encoded_input = {key: val.to(self.device) for key, val in encoded_input.items()}
            with torch.no_grad():
                model_output = self.model(**encoded_input)
            sentences_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
            all_embeddings.extend(sentences_embeddings)

        # restore the original input order
        all_embeddings = [all_embeddings[idx].cpu() for idx in np.argsort(length_sorted_idx)]

        if convert_to_tensor:
            all_embeddings = torch.stack(all_embeddings)
        elif convert_to_numpy:
            all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings])

        if input_was_string:
            all_embeddings = all_embeddings[0]

        return all_embeddings
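Usage then looks like this (a sketch; the checkpoint path is a placeholder for wherever your fine-tuned weights live):
embedder = JinaSentEmbedder("my_checkpoint")
embeddings = embedder.encode(
    ["First example sentence", "Second example sentence"],
    batch_size=32,
    show_progress_bar=True,
)
print(embeddings.shape)  # (2, embedding_dim), a numpy array by default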
Hi @Maiia, could you please share the code you utilized for fine-tuning this model?
Thank you in advance!
@metalwhale
Hello, it's just normal SentenceTransformers fine-tuning. I marked up pairs of phrases with labels (so phrase1, phrase2, label), where "label" can be either "pos" or "neg":
import torch
import polars as pl
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sentence_transformers import InputExample, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# dedup_negatives / hard_positives are lists of (title 1, title 2) pairs;
# `model` is the SentenceTransformer loaded earlier
title_df = pl.DataFrame({
    "title 1": [el[0] for el in dedup_negatives] + [el[0] for el in hard_positives],
    "title 2": [el[1] for el in dedup_negatives] + [el[1] for el in hard_positives],
    "label": ["neg"] * len(dedup_negatives) + ["pos"] * len(hard_positives),
})

# shuffle the pairs a few times
for _ in range(5):
    title_df = title_df.sample(fraction=1, shuffle=True)

train_df, val_df = train_test_split(title_df, random_state=42,
                                    test_size=0.1,
                                    stratify=title_df['label'].to_list())

train_examples = []
for row in train_df.iter_rows(named=True):
    train_examples.append(
        InputExample(texts=[row['title 1'], row['title 2']],
                     label=torch.tensor(1 if row['label'] == 'pos' else 0).to(torch.float32))
    )
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model)

sentences1 = val_df['title 1'].to_list()
sentences2 = val_df['title 2'].to_list()
scores = [torch.tensor(1 if el == 'pos' else 0).to(torch.float32) for el in val_df['label'].to_list()]
evaluator = EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)

model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1,
          warmup_steps=len(train_dataloader) // 10,
          evaluator=evaluator, evaluation_steps=len(train_dataloader) // 10)
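After model.fit finishes, the save step is just the standard call (a sketch; the folder name matches the checkpoint I reload above):
# save the fine-tuned weights locally
model.save("my_checkpoint")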
@metalwhale I hope it helps, here is the documentation: https://www.sbert.net/docs/training/overview.html
@Maiia thank you so much for your kind help. I really appreciate it!
Hi @Maiia, I followed your steps to add a JinaSentEmbedder class and load the jina model with the following code:
jina_path = './jina-embeddings-v2-base-code'
model = JinaSentEmbedder(jina_path)
...
model.fit(...)
It seems fit is not part of JinaSentEmbedder. I read the source code of sentence-transformers and found that fit is implemented in SentenceTransformer.
Does this mean I should copy the SentenceTransformer class and patch in the functions implemented in JinaSentEmbedder? Or is there another way to load JinaSentEmbedder as a SentenceTransformer?
This might be a dumb question as I am new to transformers etc. Thank you in advance!
@shijy16 A lot has happened since this issue was created. You can now load this model and fine-tune it with standard Sentence Transformers. A complete training script may look like this:
import logging
from datasets import load_dataset, Dataset
from sentence_transformers import (
SentenceTransformer,
SentenceTransformerTrainer,
SentenceTransformerTrainingArguments,
SentenceTransformerModelCardData,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import InformationRetrievalEvaluator
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO
)
# 1. Load a model to finetune with 2. (Optional) model card data
model = SentenceTransformer(
"jinaai/jina-embeddings-v2-base-en",
trust_remote_code=True,
model_card_data=SentenceTransformerModelCardData(
language="en",
license="apache-2.0",
model_name="jina-embeddings-v2-base-en trained on Natural Questions pairs",
),
)
model_name = "jina-v2-base-natural-questions"
# 3. Load a dataset to finetune on
dataset = load_dataset("sentence-transformers/natural-questions", split="train")
dataset = dataset.add_column("id", range(len(dataset)))
train_dataset: Dataset = dataset.select(range(90_000))
eval_dataset: Dataset = dataset.select(range(90_000, len(dataset)))
# 4. Define a loss function
loss = MultipleNegativesRankingLoss(model)
# 5. (Optional) Specify training arguments
args = SentenceTransformerTrainingArguments(
# Required parameter:
output_dir=f"models/{model_name}",
# Optional training parameters:
num_train_epochs=1,
per_device_train_batch_size=16,
per_device_eval_batch_size=16,
learning_rate=2e-5,
warmup_ratio=0.1,
fp16=False, # Set to False if you get an error that your GPU can't run on FP16
bf16=True, # Set to True if you have a GPU that supports BF16
batch_sampler=BatchSamplers.NO_DUPLICATES, # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
# Optional tracking/debugging parameters:
eval_strategy="steps",
eval_steps=200,
save_strategy="steps",
save_steps=200,
save_total_limit=2,
logging_steps=200,
logging_first_step=True,
run_name=model_name, # Will be used in W&B if `wandb` is installed
)
# 6. (Optional) Create an evaluator & evaluate the base model
# The full corpus, but only the evaluation queries
queries = dict(zip(eval_dataset["id"], eval_dataset["query"]))
corpus = {cid: dataset[cid]["answer"] for cid in range(10_000)} | {cid: dataset[cid]["answer"] for cid in eval_dataset["id"]}
relevant_docs = {qid: {qid} for qid in eval_dataset["id"]}
dev_evaluator = InformationRetrievalEvaluator(
corpus=corpus,
queries=queries,
relevant_docs=relevant_docs,
show_progress_bar=True,
name="natural-questions-dev",
batch_size=8,
)
dev_evaluator(model)
# 7. Create a trainer & train
trainer = SentenceTransformerTrainer(
model=model,
args=args,
train_dataset=train_dataset.remove_columns("id"),
eval_dataset=eval_dataset.remove_columns("id"),
loss=loss,
evaluator=dev_evaluator,
)
trainer.train()
# (Optional) Evaluate the trained model on the evaluator after training
dev_evaluator(model)
# 8. Save the trained model
model.save_pretrained(f"models/{model_name}/final")
# 9. (Optional) Push it to the Hugging Face Hub
model.push_to_hub(f"{model_name}")
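For completeness, reloading the fine-tuned model afterwards is just a matter of pointing SentenceTransformer at the save directory (a minimal sketch reusing model_name and the output path from step 8):
from sentence_transformers import SentenceTransformer

# reload the fine-tuned model from the local save directory
model = SentenceTransformer(f"models/{model_name}/final", trust_remote_code=True)
embeddings = model.encode(["which county is hogwarts in"])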
- Tom Aarsen
Can I apply the same to the new version, Jina Embeddings v3?
I have used the training script above to fine-tune the jinaai/jina-embeddings-v2-base-de model, but when I load it again using
from transformers import AutoTokenizer, AutoModel, AutoConfig
model = AutoModel.from_pretrained(f"models/{model_name}/final")
tokenizer = AutoTokenizer.from_pretrained(f"models/{model_name}/final")
then I get
Some weights of BertModel were not initialized from the model checkpoint at models/jina-v2-base-natural-questions/final and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.10.intermediate.dense.bias', 'encoder.layer.10.intermediate.dense.weight', 'encoder.layer.10.output.LayerNorm.bias', 'encoder.layer.10.output.LayerNorm.weight', 'encoder.layer.10.output.dense.bias', 'encoder.layer.10.output.dense.weight', 'encoder.layer.11.intermediate.dense.bias', 'encoder.layer.11.intermediate.dense.weight', 'encoder.layer.11.output.LayerNorm.bias', 'encoder.layer.11.output.LayerNorm.weight', 'encoder.layer.11.output.dense.bias', 'encoder.layer.11.output.dense.weight', 'encoder.layer.2.intermediate.dense.bias', 'encoder.layer.2.intermediate.dense.weight', 'encoder.layer.2.output.LayerNorm.bias', 'encoder.layer.2.output.LayerNorm.weight', 'encoder.layer.2.output.dense.bias', 'encoder.layer.2.output.dense.weight', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.3.intermediate.dense.weight', 'encoder.layer.3.output.LayerNorm.bias', 'encoder.layer.3.output.LayerNorm.weight', 'encoder.layer.3.output.dense.bias', 'encoder.layer.3.output.dense.weight', 'encoder.layer.4.intermediate.dense.bias', 'encoder.layer.4.intermediate.dense.weight', 'encoder.layer.4.output.LayerNorm.bias', 'encoder.layer.4.output.LayerNorm.weight', 'encoder.layer.4.output.dense.bias', 'encoder.layer.4.output.dense.weight', 'encoder.layer.5.intermediate.dense.bias', 'encoder.layer.5.intermediate.dense.weight', 'encoder.layer.5.output.LayerNorm.bias', 'encoder.layer.5.output.LayerNorm.weight', 'encoder.layer.5.output.dense.bias', 'encoder.layer.5.output.dense.weight', 'encoder.layer.6.intermediate.dense.bias', 'encoder.layer.6.intermediate.dense.weight', 'encoder.layer.6.output.LayerNorm.bias', 'encoder.layer.6.output.LayerNorm.weight', 'encoder.layer.6.output.dense.bias', 'encoder.layer.6.output.dense.weight', 'encoder.layer.7.intermediate.dense.bias', 'encoder.layer.7.intermediate.dense.weight', 'encoder.layer.7.output.LayerNorm.bias', 'encoder.layer.7.output.LayerNorm.weight', 'encoder.layer.7.output.dense.bias', 'encoder.layer.7.output.dense.weight', 'encoder.layer.8.intermediate.dense.bias', 'encoder.layer.8.intermediate.dense.weight', 'encoder.layer.8.output.LayerNorm.bias', 'encoder.layer.8.output.LayerNorm.weight', 'encoder.layer.8.output.dense.bias', 'encoder.layer.8.output.dense.weight', 'encoder.layer.9.intermediate.dense.bias', 'encoder.layer.9.intermediate.dense.weight', 'encoder.layer.9.output.LayerNorm.bias', 'encoder.layer.9.output.LayerNorm.weight', 'encoder.layer.9.output.dense.bias', 'encoder.layer.9.output.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
This is without changing anything else about the model or config, besides reducing the number of data points so it fits into my VRAM.
Is this intended behaviour or do I need to change the way I load the model?