zswzswzsw
/

grpo_run_code

Model card Files Files and versions Community

zswzswzsw committed on 16 days ago

Commit

2a4552a

verified ·

1 Parent(s): ae40651

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

grpo_max_completion.py +248 -0
grpo_offline_run.py +11 -3

grpo_max_completion.py ADDED Viewed

	@@ -0,0 +1,248 @@

+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+GRPO training script for decoder language models; completions whose recorded length exceeds `max_completion_length` are truncated before training.
+CUDA_VISIBLE_DEVICES=1,2,3,4,5 ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml grpo_max_completion.py config_grpo_offline.yaml
+"""
+import logging
+import random
+import sys
+import datasets
+import torch
+import transformers
+from transformers import AutoModelForCausalLM, set_seed
+from trl.data_utils import maybe_apply_chat_template
+from datasets import load_dataset
+from alignment import (
+    DataArguments,
+    H4ArgumentParser,
+    ModelArguments,
+    SFTConfig,
+    apply_chat_template,
+    decontaminate_humaneval,
+    get_checkpoint,
+    get_datasets,
+    get_kbit_device_map,
+    get_peft_config,
+    get_quantization_config,
+    get_tokenizer,
+)
+from trl import SFTTrainer, setup_chat_format
+from trl_012_grpo.grpo_trainer import GRPOTrainer
+from trl_012_grpo.grpo_config import GRPOConfig
+logger = logging.getLogger(__name__)
+def main():
+    parser = H4ArgumentParser((ModelArguments, DataArguments, GRPOConfig))
+    model_args, data_args, training_args = parser.parse()
+    # Set seed for reproducibility
+    set_seed(training_args.seed)
+    ###############
+    # Setup logging
+    ###############
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+        handlers=[logging.StreamHandler(sys.stdout)],
+    )
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()
+    # Log on each process a small summary
+    logger.warning(
+        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+        + f" distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
+    )
+    logger.info(f"Model parameters {model_args}")
+    logger.info(f"Data parameters {data_args}")
+    logger.info(f"Training/evaluation parameters {training_args}")
+    # Check for last checkpoint
+    last_checkpoint = get_checkpoint(training_args)
+    if last_checkpoint is not None and training_args.resume_from_checkpoint is None:
+        logger.info(f"Checkpoint detected, resuming training at {last_checkpoint=}.")
+    ###############
+    # Load datasets
+    ###############
+    raw_datasets = load_dataset("json", data_files="/data01/swzhang/dataset/grpo_data_ori/grpo_del_lowscore/shuffle/grpo_test_shuffle.json")
+    eval_raw_datasets = load_dataset("json", data_files="/data01/swzhang/dataset/grpo_data_ori/grpo_del_lowscore/shuffle/grpo_test_shuffle.json")
+    logger.info(
+        f"Training on the following datasets and their proportions: {[split + ' : ' + str(dset.num_rows) for split, dset in raw_datasets.items()]}"
+    )
+    column_names = list(raw_datasets["train"].features)
+    ################
+    # Load tokenizer
+    ################
+    tokenizer = get_tokenizer(model_args, data_args)
+    #######################
+    # Load pretrained model
+    #######################
+    logger.info("*** Load pretrained model ***")
+    torch_dtype = (
+        model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype)
+    )
+    quantization_config = get_quantization_config(model_args)
+    model_kwargs = dict(
+        revision=model_args.model_revision,
+        trust_remote_code=model_args.trust_remote_code,
+        attn_implementation=model_args.attn_implementation,
+        torch_dtype=torch_dtype,
+        use_cache=False if training_args.gradient_checkpointing else True,
+        device_map=get_kbit_device_map() if quantization_config is not None else None,
+        quantization_config=quantization_config,
+    )
+    model = model_args.model_name_or_path
+    # For ChatML we need to add special tokens and resize the embedding layer
+    if "<|im_start|>" in tokenizer.chat_template and "gemma-tokenizer-chatml" not in tokenizer.name_or_path:
+        model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, **model_kwargs)
+        model, tokenizer = setup_chat_format(model, tokenizer)
+        model_kwargs = None
+    #####################
+    # Apply chat template
+    #####################
+    def truncate_string(text, max_length, tokenizer):
+        """
+        将字符串转换为 ID 列表，截断超过 max_length 的部分，再将剩余的 ID 转回字符串。
+        Args:
+            text (str): 输入的字符串
+            max_length (int): 最大允许的长度
+            tokenizer: 用于转换的 tokenizer
+        Returns:
+            str: 截断后的字符串
+        """
+        # 将字符串转换为 ID 列表
+        input_ids = tokenizer.encode(text, add_special_tokens=False)
+        # 截断 ID 列表
+        truncated_ids = input_ids[:max_length]
+        # 将截断后的 ID 列表转回字符串
+        truncated_text = tokenizer.decode(truncated_ids, skip_special_tokens=True)
+        print('截断前：',text)
+        print('截断后: ',truncated_text)
+        return truncated_text
+    def modify_completion(example):
+        # 将 completion 转换为列表
+        example['prompt'] = \
+        maybe_apply_chat_template({"prompt": [{"role": "user", "content": example['prompt']}]}, tokenizer=tokenizer)[
+            'prompt']
+        new_completions = []
+        for length,completion in zip(example['length'],example['completion']):
+            if length>training_args.max_completion_length:
+                completion = truncate_string(completion,training_args.max_completion_length,tokenizer)
+            new_completions.append(completion)
+        example['completion'] = new_completions
+        return example
+    raw_datasets = raw_datasets.map(modify_completion)
+    eval_raw_datasets = eval_raw_datasets.map(modify_completion)
+    train_dataset = raw_datasets["train"]
+    eval_dataset = eval_raw_datasets["train"]
+    ########################
+    # Initialize the Trainer
+    ########################
+    # 这里的reward function实际不会被用到
+    def reward_len(completions, **kwargs):
+        return [-abs(20 - len(completion)) for completion in completions]
+    training_args.model_init_kwargs = model_kwargs
+    trainer = GRPOTrainer(
+        model=model,
+        reward_funcs=reward_len,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+    )
+    ###############
+    # Training loop
+    ###############
+    logger.info("*** Train ***")
+    checkpoint = None
+    if training_args.resume_from_checkpoint is not None:
+        checkpoint = training_args.resume_from_checkpoint
+    elif last_checkpoint is not None:
+        checkpoint = last_checkpoint
+    train_result = trainer.train(resume_from_checkpoint=checkpoint)
+    metrics = train_result.metrics
+    metrics["train_samples"] = len(train_dataset)
+    trainer.log_metrics("train", metrics)
+    trainer.save_metrics("train", metrics)
+    trainer.save_state()
+    ##################################
+    # Save model and create model card
+    ##################################
+    logger.info("*** Save model ***")
+    trainer.save_model(training_args.output_dir)
+    logger.info(f"Model saved to {training_args.output_dir}")
+    # Save everything else on main process
+    kwargs = {
+        "finetuned_from": model_args.model_name_or_path,
+        "dataset": list(data_args.dataset_mixer.keys()),
+        "dataset_tags": list(data_args.dataset_mixer.keys()),
+        "tags": ["alignment-handbook"],
+    }
+    if trainer.accelerator.is_main_process:
+        trainer.create_model_card(**kwargs)
+        # Restore k,v cache for fast inference
+        trainer.model.config.use_cache = True
+        trainer.model.config.save_pretrained(training_args.output_dir)
+    ##########
+    # Evaluate
+    ##########
+    if training_args.do_eval:
+        logger.info("*** Evaluate ***")
+        metrics = trainer.evaluate()
+        metrics["eval_samples"] = len(eval_dataset)
+        trainer.log_metrics("eval", metrics)
+        trainer.save_metrics("eval", metrics)
+    if training_args.push_to_hub is True:
+        logger.info("Pushing to hub...")
+        trainer.push_to_hub(**kwargs)
+    logger.info("*** Training complete ***")
+if __name__ == "__main__":
+    main()

grpo_offline_run.py CHANGED Viewed

@@ -15,7 +15,7 @@
 # limitations under the License.
 """
 Supervised fine-tuning script for decoder language models.
-CUDA_VISIBLE_DEVICES=1,2,3,4,5 ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml grpo_offline_run.py config_grpo_offline.yaml
 """
 import logging
@@ -27,7 +27,7 @@ import torch
 import transformers
 from transformers import AutoModelForCausalLM, set_seed
 from trl.data_utils import maybe_apply_chat_template
-from datasets import load_dataset
 from alignment import (
     DataArguments,
     H4ArgumentParser,
@@ -88,7 +88,15 @@ def main():
     ###############
     # Load datasets
     ###############
-    raw_datasets = load_dataset("json", data_files="/data01/swzhang/dataset/grpo_data_ori/grpo_del_lowscore/shuffle/grpo_test_shuffle.json")
     eval_raw_datasets = load_dataset("json", data_files="/data01/swzhang/dataset/grpo_data_ori/grpo_del_lowscore/shuffle/grpo_test_shuffle.json")
     logger.info(
         f"Training on the following datasets and their proportions: {[split + ' : ' + str(dset.num_rows) for split, dset in raw_datasets.items()]}"

 # limitations under the License.
 """
 Supervised fine-tuning script for decoder language models.
+CUDA_VISIBLE_DEVICES=2,3,4,5 ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml grpo_offline_run.py config_grpo_offline.yaml
 """
 import logging
 import transformers
 from transformers import AutoModelForCausalLM, set_seed
 from trl.data_utils import maybe_apply_chat_template
+from datasets import load_dataset, Features, Value, Sequence
 from alignment import (
     DataArguments,
     H4ArgumentParser,
     ###############
     # Load datasets
     ###############
+    features = Features({
+        "prompt": Value("large_string"),  # prompt 字段可能较长，使用 large_string
+        "completion": Sequence(feature=Value("large_string")),  # completion 是字符串列表，使用 list<large_string>
+        "reward": Sequence(feature=Value("float32")),  # reward 是整数列表
+        "length": Sequence(feature=Value("int32")),  # length 是整数列表
+        "instruction_len": Value("int32"),  # instruction_len 是整数
+        "del_score": Value("float32")  # del_score 是浮点数
+    })
+    raw_datasets = load_dataset("json", data_files="/data01/swzhang/dataset/grpo_data_ori/grpo_del_lowscore/shuffle/grpo_train_shuffle.json",features=features)
     eval_raw_datasets = load_dataset("json", data_files="/data01/swzhang/dataset/grpo_data_ori/grpo_del_lowscore/shuffle/grpo_test_shuffle.json")
     logger.info(
         f"Training on the following datasets and their proportions: {[split + ' : ' + str(dset.num_rows) for split, dset in raw_datasets.items()]}"