chenggong1995 committed
Commit d84ddac · verified · 1 Parent(s): 3526338

Model save
README.md ADDED
@@ -0,0 +1,68 @@
+ ---
+ base_model: Qwen/Qwen2.5-Math-7B
+ library_name: transformers
+ model_name: Qwen2.5-Math-7B-gen8-math3to5-ghpo-cold0-3Dhint-prompt1-epoch1
+ tags:
+ - generated_from_trainer
+ - trl
+ - ghpo
+ licence: license
+ ---
+ 
+ # Model Card for Qwen2.5-Math-7B-gen8-math3to5-ghpo-cold0-3Dhint-prompt1-epoch1
+ 
+ This model is a fine-tuned version of [Qwen/Qwen2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B).
+ It has been trained using [TRL](https://github.com/huggingface/trl).
+ 
+ ## Quick start
+ 
+ ```python
+ from transformers import pipeline
+ 
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+ generator = pipeline("text-generation", model="chenggong1995/Qwen2.5-Math-7B-gen8-math3to5-ghpo-cold0-3Dhint-prompt1-epoch1", device="cuda")
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+ print(output["generated_text"])
+ ```
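+ 
+ If you prefer explicit control over generation, the snippet below is a minimal sketch using `AutoModelForCausalLM` directly; it assumes the tokenizer ships a chat template (as Qwen2.5-Math tokenizers do) and that a CUDA device is available.
+ 
+ ```python
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ 
+ model_id = "chenggong1995/Qwen2.5-Math-7B-gen8-math3to5-ghpo-cold0-3Dhint-prompt1-epoch1"
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")
+ 
+ # Build the prompt with the tokenizer's chat template, then generate.
+ messages = [{"role": "user", "content": "Solve for x: 2x + 3 = 11."}]
+ inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
+ outputs = model.generate(inputs, max_new_tokens=512)
+ print(tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True))
+ ```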
+ 
+ ## Training procedure
+ 
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/gongc1995-city-university-of-hong-kong/huggingface/runs/j1gylbgn)
+ 
+ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
+ 
+ ### Framework versions
+ 
+ - TRL: 0.16.0
+ - Transformers: 4.50.0
+ - Pytorch: 2.5.1
+ - Datasets: 3.5.0
+ - Tokenizers: 0.21.1
+ 
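+ A plausible way to pin this environment (assumption: a CUDA build of PyTorch 2.5.1 is installed separately):
+ 
+ ```shell
+ pip install trl==0.16.0 transformers==4.50.0 datasets==3.5.0 tokenizers==0.21.1
+ ```
+ 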
+ ## Citations
+ 
+ Cite GRPO as:
+ 
+ ```bibtex
+ @article{zhihong2024deepseekmath,
+     title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
+     author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
+     year = 2024,
+     eprint = {arXiv:2402.03300},
+ }
+ ```
+ 
+ Cite TRL as:
+ 
+ ```bibtex
+ @misc{vonwerra2022trl,
+     title = {{TRL: Transformer Reinforcement Learning}},
+     author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
+     year = 2020,
+     journal = {GitHub repository},
+     publisher = {GitHub},
+     howpublished = {\url{https://github.com/huggingface/trl}}
+ }
+ ```
all_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "total_flos": 0.0,
+   "train_loss": 0.12618592532375192,
+   "train_runtime": 35348.2691,
+   "train_samples": 8888,
+   "train_samples_per_second": 0.251,
+   "train_steps_per_second": 0.002
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "bos_token_id": 151643,
+   "eos_token_id": 151643,
+   "max_new_tokens": 2048,
+   "transformers_version": "4.50.0"
+ }
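
These defaults are picked up automatically by `generate()`. A minimal sketch of reading them back from the Hub (the repo id is the model published by this commit):

```python
from transformers import GenerationConfig

# Fetch the committed generation defaults; max_new_tokens should print 2048.
gen_config = GenerationConfig.from_pretrained(
    "chenggong1995/Qwen2.5-Math-7B-gen8-math3to5-ghpo-cold0-3Dhint-prompt1-epoch1"
)
print(gen_config.eos_token_id, gen_config.max_new_tokens)
```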
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "total_flos": 0.0,
+   "train_loss": 0.12618592532375192,
+   "train_runtime": 35348.2691,
+   "train_samples": 8888,
+   "train_samples_per_second": 0.251,
+   "train_steps_per_second": 0.002
+ }
trainer_state.json ADDED
@@ -0,0 +1,274 @@
+ {
+   "best_global_step": null,
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 0.9952755905511811,
+   "eval_steps": 10000000000,
+   "global_step": 79,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 797.146240234375,
+       "epoch": 0.012598425196850394,
+       "grad_norm": 0.22265445878992737,
+       "learning_rate": 1.25e-07,
+       "loss": 0.201,
+       "num_tokens": 871723.0,
+       "reward": 0.5814732387661934,
+       "reward_std": 0.4103100262582302,
+       "rewards/accuracy_reward": 0.5747767761349678,
+       "rewards/format_reward": 0.01339285762514919,
+       "step": 1
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 804.7539443969727,
+       "epoch": 0.06299212598425197,
+       "grad_norm": 0.2417152167110338,
+       "learning_rate": 6.249999999999999e-07,
+       "loss": 0.1642,
+       "num_tokens": 4403713.0,
+       "reward": 0.5676618544384837,
+       "reward_std": 0.4300461960956454,
+       "rewards/accuracy_reward": 0.563895090483129,
+       "rewards/format_reward": 0.007533482450526208,
+       "step": 5
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 785.3382049560547,
+       "epoch": 0.12598425196850394,
+       "grad_norm": 0.3011761966028881,
+       "learning_rate": 9.980434110374724e-07,
+       "loss": 0.1628,
+       "num_tokens": 8734420.0,
+       "reward": 0.575000024586916,
+       "reward_std": 0.40654933378100394,
+       "rewards/accuracy_reward": 0.5716517880558968,
+       "rewards/format_reward": 0.006696428824216128,
+       "step": 10
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 807.4542770385742,
+       "epoch": 0.1889763779527559,
+       "grad_norm": 0.1895048136482896,
+       "learning_rate": 9.762072666790656e-07,
+       "loss": 0.1844,
+       "num_tokens": 13144543.0,
+       "reward": 0.5949777036905288,
+       "reward_std": 0.3915623873472214,
+       "rewards/accuracy_reward": 0.5912946447730064,
+       "rewards/format_reward": 0.007366071711294353,
+       "step": 15
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 804.9518280029297,
+       "epoch": 0.25196850393700787,
+       "grad_norm": 0.17095132727584114,
+       "learning_rate": 9.311572862600138e-07,
+       "loss": 0.1833,
+       "num_tokens": 17551799.0,
+       "reward": 0.6332589581608772,
+       "reward_std": 0.37318109199404714,
+       "rewards/accuracy_reward": 0.6314732164144516,
+       "rewards/format_reward": 0.00357142873108387,
+       "step": 20
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 797.7531616210938,
+       "epoch": 0.31496062992125984,
+       "grad_norm": 0.18012116491915073,
+       "learning_rate": 8.650895363529172e-07,
+       "loss": 0.1815,
+       "num_tokens": 21890173.0,
+       "reward": 0.6541294917464257,
+       "reward_std": 0.3551797144114971,
+       "rewards/accuracy_reward": 0.6547733508050442,
+       "rewards/format_reward": 0.0015625000698491931,
+       "step": 25
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 759.0582946777344,
+       "epoch": 0.3779527559055118,
+       "grad_norm": 0.22639137855339267,
+       "learning_rate": 7.812246438203903e-07,
+       "loss": 0.1692,
+       "num_tokens": 26076066.0,
+       "reward": 0.708482176065445,
+       "reward_std": 0.30925857946276664,
+       "rewards/accuracy_reward": 0.7082589268684387,
+       "rewards/format_reward": 0.00044642859138548373,
+       "step": 30
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 739.7466842651368,
+       "epoch": 0.4409448818897638,
+       "grad_norm": 0.37314514493140266,
+       "learning_rate": 6.836507988323784e-07,
+       "loss": 0.1384,
+       "num_tokens": 30202131.0,
+       "reward": 0.7066964626312255,
+       "reward_std": 0.2901428207755089,
+       "rewards/accuracy_reward": 0.7066964276134968,
+       "rewards/format_reward": 0.0,
+       "step": 35
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 772.0323989868164,
+       "epoch": 0.5039370078740157,
+       "grad_norm": 0.2614993046463775,
+       "learning_rate": 5.771244664826511e-07,
+       "loss": 0.1437,
+       "num_tokens": 34466708.0,
+       "reward": 0.6880580708384514,
+       "reward_std": 0.3077801916748285,
+       "rewards/accuracy_reward": 0.6879464246332645,
+       "rewards/format_reward": 0.00022321429569274187,
+       "step": 40
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 713.0567276000977,
+       "epoch": 0.5669291338582677,
+       "grad_norm": 0.1454705351797755,
+       "learning_rate": 4.6683852178244817e-07,
+       "loss": 0.0997,
+       "num_tokens": 38522522.0,
+       "reward": 0.7095982447266579,
+       "reward_std": 0.2782834365963936,
+       "rewards/accuracy_reward": 0.7095982141792774,
+       "rewards/format_reward": 0.0,
+       "step": 45
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 742.1245864868164,
+       "epoch": 0.6299212598425197,
+       "grad_norm": 0.14001943959476107,
+       "learning_rate": 3.5816911083285164e-07,
+       "loss": 0.0861,
+       "num_tokens": 42626344.0,
+       "reward": 0.7125000342726707,
+       "reward_std": 0.26790192127227785,
+       "rewards/accuracy_reward": 0.7125,
+       "rewards/format_reward": 0.0,
+       "step": 50
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 725.4094085693359,
+       "epoch": 0.6929133858267716,
+       "grad_norm": 0.11596792832277024,
+       "learning_rate": 2.5641357801960184e-07,
+       "loss": 0.0765,
+       "num_tokens": 46657290.0,
+       "reward": 0.712500037252903,
+       "reward_std": 0.2700365446507931,
+       "rewards/accuracy_reward": 0.7125000022351742,
+       "rewards/format_reward": 0.0,
+       "step": 55
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 716.4462371826172,
+       "epoch": 0.7559055118110236,
+       "grad_norm": 0.22234439564713598,
+       "learning_rate": 1.665322345816746e-07,
+       "loss": 0.0849,
+       "num_tokens": 50648721.0,
+       "reward": 0.7366071820259095,
+       "reward_std": 0.24836960211396217,
+       "rewards/accuracy_reward": 0.7366071425378322,
+       "rewards/format_reward": 0.0,
+       "step": 60
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 717.4730224609375,
+       "epoch": 0.8188976377952756,
+       "grad_norm": 0.182600444186133,
+       "learning_rate": 9.290655664821296e-08,
+       "loss": 0.0909,
+       "num_tokens": 54634528.0,
+       "reward": 0.7285714581608772,
+       "reward_std": 0.25840977653861047,
+       "rewards/accuracy_reward": 0.7285714313387871,
+       "rewards/format_reward": 0.0,
+       "step": 65
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 712.1507034301758,
+       "epoch": 0.8818897637795275,
+       "grad_norm": 0.17072451326668445,
+       "learning_rate": 3.912559994556086e-08,
+       "loss": 0.0883,
+       "num_tokens": 58570507.0,
+       "reward": 0.7503348544239998,
+       "reward_std": 0.25088600218296053,
+       "rewards/accuracy_reward": 0.7515796698629856,
+       "rewards/format_reward": 0.00022321429569274187,
+       "step": 70
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 719.5852981567383,
+       "epoch": 0.9448818897637795,
+       "grad_norm": 0.2933895566333781,
+       "learning_rate": 7.811042888637209e-09,
+       "loss": 0.0763,
+       "num_tokens": 62600049.0,
+       "reward": 0.7198661029338836,
+       "reward_std": 0.2631711885333061,
+       "rewards/accuracy_reward": 0.7198660731315613,
+       "rewards/format_reward": 0.0,
+       "step": 75
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 755.7718505859375,
+       "epoch": 0.9952755905511811,
+       "num_tokens": 65965046.0,
+       "reward": 0.7045201249420643,
+       "reward_std": 0.28055303543806076,
+       "rewards/accuracy_reward": 0.7042410708963871,
+       "rewards/format_reward": 0.0005580357392318547,
+       "step": 79,
+       "total_flos": 0.0,
+       "train_loss": 0.12618592532375192,
+       "train_runtime": 35348.2691,
+       "train_samples_per_second": 0.251,
+       "train_steps_per_second": 0.002
+     }
+   ],
+   "logging_steps": 5,
+   "max_steps": 79,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 1,
+   "save_steps": 500,
+   "stateful_callbacks": {
+     "TrainerControl": {
+       "args": {
+         "should_epoch_stop": false,
+         "should_evaluate": false,
+         "should_log": false,
+         "should_save": true,
+         "should_training_stop": true
+       },
+       "attributes": {}
+     }
+   },
+   "total_flos": 0.0,
+   "train_batch_size": 16,
+   "trial_name": null,
+   "trial_params": null
+ }
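
For a quick look at the optimization trace above, the log can be read directly; a minimal sketch (assumes a local copy of trainer_state.json):

```python
import json

# Print mean reward per logged optimizer step from the trainer state.
with open("trainer_state.json") as f:
    state = json.load(f)

for entry in state["log_history"]:
    if "reward" in entry:
        print(entry["step"], round(entry["reward"], 4))
```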