import inspect
import warnings
from dataclasses import FrozenInstanceError, replace
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import torch
import torch.nn as nn
from datasets import Dataset
from transformers import DataCollator, PreTrainedModel, PreTrainedTokenizerBase, Trainer, TrainingArguments
from transformers.trainer_callback import TrainerCallback
from transformers.trainer_pt_utils import nested_detach
from transformers.trainer_utils import EvalPrediction

from ..import_utils import is_peft_available
from .reward_config import RewardConfig
from .utils import RewardDataCollatorWithPadding, compute_accuracy


if is_peft_available():
    from peft import PeftModel, get_peft_model, prepare_model_for_kbit_training


class RewardTrainer(Trainer):
    r"""
    The RewardTrainer can be used to train your custom Reward Model. It is a subclass of the
    `transformers.Trainer` class and inherits all of its attributes and methods. It is recommended to use
    an `AutoModelForSequenceClassification` as the reward model. The reward model should be trained on a dataset
    of paired examples, where each example is a tuple of two sequences. The reward model should be trained to
    predict which example in the pair is more relevant to the task at hand.

    The reward trainer expects a very specific format for the dataset. When using the default
    `RewardDataCollatorWithPadding` data collator, the dataset should contain at least the following four entries:

    - `input_ids_chosen`
    - `attention_mask_chosen`
    - `input_ids_rejected`
    - `attention_mask_rejected`

    Optionally, you can also pass a `margin` entry to the dataset. This entry should contain the margin used to modulate the
    loss of the reward model as outlined in https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/.
    If you don't pass a margin, no margin will be used.
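
    Example (a minimal sketch, not a drop-in script: the checkpoint name, toy dataset, and tokenization below are
    illustrative placeholders, not recommended settings):

    ```python
    from datasets import Dataset
    from transformers import AutoModelForSequenceClassification, AutoTokenizer

    from trl import RewardConfig, RewardTrainer

    checkpoint = "distilbert-base-uncased"  # any sequence-classification checkpoint works
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

    def tokenize_pair(example):
        # Tokenize the preferred ("chosen") and dispreferred ("rejected") texts separately.
        chosen = tokenizer(example["chosen"], truncation=True)
        rejected = tokenizer(example["rejected"], truncation=True)
        return {
            "input_ids_chosen": chosen["input_ids"],
            "attention_mask_chosen": chosen["attention_mask"],
            "input_ids_rejected": rejected["input_ids"],
            "attention_mask_rejected": rejected["attention_mask"],
        }

    raw = Dataset.from_dict({"chosen": ["A helpful answer."], "rejected": ["An unhelpful answer."]})
    train_dataset = raw.map(tokenize_pair, remove_columns=raw.column_names)

    trainer = RewardTrainer(
        model=model,
        args=RewardConfig(output_dir="reward_model", max_length=512),
        tokenizer=tokenizer,
        train_dataset=train_dataset,
    )
    trainer.train()
    ```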
    """

    def __init__(
        self,
        model: Union[PreTrainedModel, nn.Module] = None,
        args: Optional[RewardConfig] = None,
        data_collator: Optional[DataCollator] = None,
        train_dataset: Optional[Dataset] = None,
        eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
        tokenizer: Optional[PreTrainedTokenizerBase] = None,
        model_init: Optional[Callable[[], PreTrainedModel]] = None,
        compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
        callbacks: Optional[List[TrainerCallback]] = None,
        optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (
            None,
            None,
        ),
        preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
        max_length: Optional[int] = None,
        peft_config: Optional[Dict] = None,
    ):
        """
        Initialize RewardTrainer.

        Args:
            model (`transformers.PreTrainedModel`):
                The model to train, preferably an `AutoModelForSequenceClassification`.
            args (`RewardConfig`):
                The arguments to use for training.
            data_collator (`transformers.DataCollator`):
                The data collator to use for training. If None is specified, the default data collator (`RewardDataCollatorWithPadding`)
                will be used, which will pad the sequences to the maximum length of the sequences in the batch, given a dataset of paired sequences.
            train_dataset (`datasets.Dataset`):
                The dataset to use for training.
            eval_dataset (`datasets.Dataset`):
                The dataset to use for evaluation.
            tokenizer (`transformers.PreTrainedTokenizerBase`):
                The tokenizer to use for training. This argument is required if you want to use the default data collator.
            model_init (`Callable[[], transformers.PreTrainedModel]`):
                The model initializer to use for training. If None is specified, the default model initializer will be used.
            compute_metrics (`Callable[[transformers.EvalPrediction], Dict]`, *optional*, defaults to `compute_accuracy`):
                The metrics to use for evaluation. If no metrics are specified, the default metric (`compute_accuracy`) will be used.
            callbacks (`List[transformers.TrainerCallback]`):
                The callbacks to use for training.
            optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
                The optimizer and scheduler to use for training.
            preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
                The function to use to preprocess the logits before computing the metrics.
            max_length (`int`, defaults to `None`):
                The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator.
            peft_config (`Dict`, defaults to `None`):
                The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in a PEFT model.
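
        For illustration only (the `LoraConfig` values below are arbitrary, not recommended defaults), a PEFT
        setup could look like:

        ```python
        from peft import LoraConfig, TaskType

        peft_config = LoraConfig(task_type=TaskType.SEQ_CLS, r=8, lora_alpha=16, lora_dropout=0.05)
        # then pass `peft_config=peft_config` to RewardTrainer(...)
        ```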
        """
        if type(args) == TrainingArguments:
            warnings.warn(
                "Using `transformers.TrainingArguments` for `args` is deprecated and will be removed in a future version. Please use `RewardConfig` instead.",
                FutureWarning,
            )
            if max_length is not None:
                warnings.warn(
                    "The `max_length` argument is deprecated and will be removed in a future version. Please use the `RewardConfig` to set `max_length` instead.",
                    FutureWarning,
                )
        else:
            if max_length is not None and args.max_length is not None:
                raise ValueError(
                    "You cannot specify both `max_length` and `args.max_length`. Please use the `RewardConfig` to set `max_length` once."
                )
            if max_length is not None and args.max_length is None:
                warnings.warn(
                    "The `max_length` argument is deprecated and will be removed in a future version. Please use the `RewardConfig` to set `max_length` instead.",
                    FutureWarning,
                )
        if not is_peft_available() and peft_config is not None:
            raise ValueError(
                "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it to use the PEFT models"
            )
        elif is_peft_available() and peft_config is not None:
            if not isinstance(model, PeftModel):
                if getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_quantized", False):
                    # Quantized models must be prepared for k-bit training before being wrapped with PEFT.
                    _supports_gc_kwargs = "gradient_checkpointing_kwargs" in list(
                        inspect.signature(prepare_model_for_kbit_training).parameters
                    )

                    prepare_model_kwargs = {"use_gradient_checkpointing": args.gradient_checkpointing}

                    if not _supports_gc_kwargs and args.gradient_checkpointing_kwargs is not None:
                        warnings.warn(
                            "You passed `gradient_checkpointing_kwargs` in the trainer's kwargs, but your peft version does not support it. "
                            "Please update to the latest version of peft to use `gradient_checkpointing_kwargs`."
                        )
                    elif _supports_gc_kwargs and args.gradient_checkpointing_kwargs is not None:
                        prepare_model_kwargs["gradient_checkpointing_kwargs"] = args.gradient_checkpointing_kwargs

                    model = prepare_model_for_kbit_training(model, **prepare_model_kwargs)

                model = get_peft_model(model, peft_config)

        if compute_metrics is None:
            compute_metrics = compute_accuracy

        if data_collator is None:
            if tokenizer is None:
                raise ValueError(
                    "A tokenizer must be specified when using the default RewardDataCollatorWithPadding"
                )
            if type(args) == TrainingArguments:
                if max_length is None:
                    warnings.warn(
                        "When using RewardDataCollatorWithPadding, you should set `max_length` in RewardConfig."
                        " It will be set to `512` by default, but you should do it yourself in the future.",
                        UserWarning,
                    )
                    max_length = 512
            else:
                if max_length is None and args.max_length is None:
                    warnings.warn(
                        "When using RewardDataCollatorWithPadding, you should set `max_length` in RewardConfig."
                        " It will be set to `512` by default, but you should do it yourself in the future.",
                        UserWarning,
                    )
                    max_length = 512
                if max_length is None and args.max_length is not None:
                    max_length = args.max_length

            data_collator = RewardDataCollatorWithPadding(tokenizer, max_length=max_length)

            if args.remove_unused_columns:
                try:
                    # `args` may be a frozen dataclass; fall back to `dataclasses.replace` below.
                    args.remove_unused_columns = False
                except FrozenInstanceError:
                    args = replace(args, remove_unused_columns=False)

                warnings.warn(
                    "When using RewardDataCollatorWithPadding, you should set `remove_unused_columns=False` in your RewardConfig."
                    " We have set it for you, but you should do it yourself in the future.",
                    UserWarning,
                )

            self.use_reward_data_collator = True
        else:
            self.use_reward_data_collator = False

        super().__init__(
            model,
            args,
            data_collator,
            train_dataset,
            eval_dataset,
            tokenizer,
            model_init,
            compute_metrics,
            callbacks,
            optimizers,
            preprocess_logits_for_metrics,
        )

    def compute_loss(
        self,
        model: Union[PreTrainedModel, nn.Module],
        inputs: Dict[str, Union[torch.Tensor, Any]],
        return_outputs=False,
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, Dict[str, torch.Tensor]]]:
        if not self.use_reward_data_collator:
            warnings.warn(
                "The current compute_loss is implemented for RewardDataCollatorWithPadding,"
                " if you are using a custom data collator make sure you know what you are doing or"
                " implement your own compute_loss method."
            )
        rewards_chosen = model(
            input_ids=inputs["input_ids_chosen"],
            attention_mask=inputs["attention_mask_chosen"],
            return_dict=True,
        )["logits"]
        rewards_rejected = model(
            input_ids=inputs["input_ids_rejected"],
            attention_mask=inputs["attention_mask_rejected"],
            return_dict=True,
        )["logits"]

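        # Pairwise preference loss: maximize log sigmoid(rewards_chosen - rewards_rejected).
        # When the dataset provides a `margin` column, it is subtracted from the reward gap
        # before the sigmoid, as described in the Llama 2 paper referenced in the class docstring.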
        if "margin" in inputs:
            loss = -nn.functional.logsigmoid(rewards_chosen - rewards_rejected - inputs["margin"]).mean()
        else:
            loss = -nn.functional.logsigmoid(rewards_chosen - rewards_rejected).mean()

        if return_outputs:
            return loss, {
                "rewards_chosen": rewards_chosen,
                "rewards_rejected": rewards_rejected,
            }
        return loss

    def prediction_step(
        self,
        model: Union[PreTrainedModel, nn.Module],
        inputs: Dict[str, Union[torch.Tensor, Any]],
        prediction_loss_only: bool,
        ignore_keys: Optional[List[str]] = None,
    ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
        inputs = self._prepare_inputs(inputs)
        if ignore_keys is None:
            if hasattr(self.model, "config"):
                ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", [])
            else:
                ignore_keys = []

        with torch.no_grad():
            loss, logits_dict = self.compute_loss(model, inputs, return_outputs=True)

        if prediction_loss_only:
            return (loss, None, None)

        loss = loss.detach()
        logits = tuple(v for k, v in logits_dict.items() if k not in ignore_keys)
        logits = nested_detach(logits)

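        # Stack the chosen/rejected rewards into shape (2, batch, ...), average over the trailing
        # logits dimension, and softmax across the pair so each row of the transposed (batch, 2)
        # tensor gives the preference probabilities for (chosen, rejected).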
        logits = torch.stack(logits).mean(dim=2).softmax(dim=0).T

        labels = torch.zeros(logits.shape[0])
        labels = self._prepare_inputs(labels)

        return loss, logits, labels