Model save

Browse files

Files changed (6) hide show

README.md +67 -0
all_results.json +8 -0
config.json +1 -1
generation_config.json +14 -0
train_results.json +8 -0
trainer_state.json +1443 -0

README.md ADDED Viewed

	@@ -0,0 +1,67 @@

+---
+library_name: transformers
+model_name: Qwen2.5-0.5B-Instruct_grpo_Countdown-Tasks-3to4
+tags:
+- generated_from_trainer
+- trl
+- grpo
+licence: license
+---
+# Model Card for Qwen2.5-0.5B-Instruct_grpo_Countdown-Tasks-3to4
+This model is a fine-tuned version of [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+## Quick start
+```python
+from transformers import pipeline
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="weege007/Qwen2.5-0.5B-Instruct_grpo_Countdown-Tasks-3to4", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+## Training procedure
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/weege007/huggingface/runs/qklw2es4)
+This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
+### Framework versions
+- TRL: 0.18.0
+- Transformers: 4.52.4
+- PyTorch: 2.6.0+cu124
+- Datasets: 3.3.2
+- Tokenizers: 0.21.1
+## Citations
+Cite GRPO as:
+```bibtex
+@article{zhihong2024deepseekmath,
+    title        = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
+    author       = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
+    year         = 2024,
+    eprint       = {arXiv:2402.03300},
+}
+```
+Cite TRL as:
+```bibtex
+@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```

all_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "total_flos": 0.0,
+    "train_loss": 8.04626297735922e-06,
+    "train_runtime": 2640.6632,
+    "train_samples": 441327,
+    "train_samples_per_second": 0.303,
+    "train_steps_per_second": 0.038
+}

config.json CHANGED Viewed

@@ -22,7 +22,7 @@
   "tie_word_embeddings": true,
   "torch_dtype": "float32",
   "transformers_version": "4.52.4",
-  "use_cache": false,
   "use_sliding_window": false,
   "vocab_size": 151936
 }

   "tie_word_embeddings": true,
   "torch_dtype": "float32",
   "transformers_version": "4.52.4",
+  "use_cache": true,
   "use_sliding_window": false,
   "vocab_size": 151936
 }

generation_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.1,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "4.52.4"
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "total_flos": 0.0,
+    "train_loss": 8.04626297735922e-06,
+    "train_runtime": 2640.6632,
+    "train_samples": 441327,
+    "train_samples_per_second": 0.303,
+    "train_steps_per_second": 0.038
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1443 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.00022658935437895256,
+  "eval_steps": 500,
+  "global_step": 100,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 656.0,
+      "completions/max_terminated_length": 222.5,
+      "completions/mean_length": 194.0625,
+      "completions/mean_terminated_length": 136.94643020629883,
+      "completions/min_length": 61.5,
+      "completions/min_terminated_length": 61.5,
+      "epoch": 4.531787087579051e-06,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 12.525100708007812,
+      "kl": 0.0,
+      "learning_rate": 1.6666666666666665e-07,
+      "loss": -0.0,
+      "num_tokens": 5377.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.0625,
+      "rewards/format_reward_func/std": 0.1767766922712326,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 618.0,
+      "completions/max_terminated_length": 198.0,
+      "completions/mean_length": 171.25,
+      "completions/mean_terminated_length": 114.93750381469727,
+      "completions/min_length": 57.5,
+      "completions/min_terminated_length": 57.5,
+      "epoch": 9.063574175158102e-06,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.001167318900115788,
+      "kl": 0.0009191570134134963,
+      "learning_rate": 5e-07,
+      "loss": 0.0,
+      "num_tokens": 10397.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.0625,
+      "rewards/format_reward_func/std": 0.1767766922712326,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 477.0,
+      "completions/max_terminated_length": 477.0,
+      "completions/mean_length": 175.5,
+      "completions/mean_terminated_length": 175.5,
+      "completions/min_length": 38.5,
+      "completions/min_terminated_length": 38.5,
+      "epoch": 1.3595361262737154e-05,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0017322486964985728,
+      "kl": 0.0009386140663991682,
+      "learning_rate": 4.994757065594279e-07,
+      "loss": 0.0,
+      "num_tokens": 15493.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 307.0,
+      "completions/max_terminated_length": 307.0,
+      "completions/mean_length": 156.25,
+      "completions/mean_terminated_length": 156.25,
+      "completions/min_length": 47.0,
+      "completions/min_terminated_length": 47.0,
+      "epoch": 1.8127148350316204e-05,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 11.28031063079834,
+      "kl": 0.0007646345866305637,
+      "learning_rate": 4.979050253066063e-07,
+      "loss": 0.0,
+      "num_tokens": 20313.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.0625,
+      "rewards/format_reward_func/std": 0.1767766922712326,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 312.5,
+      "completions/max_terminated_length": 312.5,
+      "completions/mean_length": 152.25,
+      "completions/mean_terminated_length": 152.25,
+      "completions/min_length": 65.0,
+      "completions/min_terminated_length": 65.0,
+      "epoch": 2.2658935437895258e-05,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 9.902769088745117,
+      "kl": 0.0008557607743568951,
+      "learning_rate": 4.952945442245597e-07,
+      "loss": 0.0,
+      "num_tokens": 25061.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.0625,
+      "rewards/format_reward_func/std": 0.1767766922712326,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 543.5,
+      "completions/max_terminated_length": 543.5,
+      "completions/mean_length": 164.4375,
+      "completions/mean_terminated_length": 164.4375,
+      "completions/min_length": 38.0,
+      "completions/min_terminated_length": 38.0,
+      "epoch": 2.7190722525474308e-05,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0027156081050634384,
+      "kl": 0.0010605865281831939,
+      "learning_rate": 4.916552125781528e-07,
+      "loss": 0.0,
+      "num_tokens": 29980.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.0625,
+      "rewards/format_reward_func/std": 0.1767766922712326,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 451.0,
+      "completions/max_terminated_length": 451.0,
+      "completions/mean_length": 211.0,
+      "completions/mean_terminated_length": 211.0,
+      "completions/min_length": 70.5,
+      "completions/min_terminated_length": 70.5,
+      "epoch": 3.172250961305336e-05,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 12.591176986694336,
+      "kl": 0.0009377936348755611,
+      "learning_rate": 4.870022949890676e-07,
+      "loss": 0.0,
+      "num_tokens": 35676.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.0625,
+      "rewards/format_reward_func/std": 0.1767766922712326,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 465.5,
+      "completions/max_terminated_length": 465.5,
+      "completions/mean_length": 163.375,
+      "completions/mean_terminated_length": 163.375,
+      "completions/min_length": 49.0,
+      "completions/min_terminated_length": 49.0,
+      "epoch": 3.625429670063241e-05,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 10.893583297729492,
+      "kl": 0.001199566273498931,
+      "learning_rate": 4.81355307410676e-07,
+      "loss": 0.0,
+      "num_tokens": 40570.0,
+      "reward": 0.125,
+      "reward_std": 0.3535533845424652,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.125,
+      "rewards/format_reward_func/std": 0.3535533845424652,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 401.0,
+      "completions/max_terminated_length": 401.0,
+      "completions/mean_length": 209.75,
+      "completions/mean_terminated_length": 209.75,
+      "completions/min_length": 63.0,
+      "completions/min_terminated_length": 63.0,
+      "epoch": 4.078608378821146e-05,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0009183982037939131,
+      "kl": 0.0008469254862575326,
+      "learning_rate": 4.747379352713488e-07,
+      "loss": 0.0,
+      "num_tokens": 46174.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 583.0,
+      "completions/max_terminated_length": 583.0,
+      "completions/mean_length": 211.125,
+      "completions/mean_terminated_length": 211.125,
+      "completions/min_length": 69.0,
+      "completions/min_terminated_length": 69.0,
+      "epoch": 4.5317870875790515e-05,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0012323512928560376,
+      "kl": 0.0012486951891332865,
+      "learning_rate": 4.6717793412953776e-07,
+      "loss": 0.0,
+      "num_tokens": 51832.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 620.5,
+      "completions/max_terminated_length": 285.5,
+      "completions/mean_length": 202.9375,
+      "completions/mean_terminated_length": 150.90178680419922,
+      "completions/min_length": 60.5,
+      "completions/min_terminated_length": 60.5,
+      "epoch": 4.984965796336956e-05,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0016945754177868366,
+      "kl": 0.00123220352907083,
+      "learning_rate": 4.5870701325731773e-07,
+      "loss": 0.0,
+      "num_tokens": 57327.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 683.0,
+      "completions/max_terminated_length": 434.5,
+      "completions/mean_length": 257.0,
+      "completions/mean_terminated_length": 209.44644165039062,
+      "completions/min_length": 84.5,
+      "completions/min_terminated_length": 84.5,
+      "epoch": 5.4381445050948616e-05,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.00123355642426759,
+      "kl": 0.0011914248134416994,
+      "learning_rate": 4.4936070264068016e-07,
+      "loss": 0.0,
+      "num_tokens": 63719.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 641.0,
+      "completions/max_terminated_length": 235.5,
+      "completions/mean_length": 181.3125,
+      "completions/mean_terminated_length": 125.05357360839844,
+      "completions/min_length": 35.0,
+      "completions/min_terminated_length": 35.0,
+      "epoch": 5.891323213852767e-05,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 13.299399375915527,
+      "kl": 0.0018970294222526718,
+      "learning_rate": 4.391782039544238e-07,
+      "loss": 0.0,
+      "num_tokens": 68924.0,
+      "reward": 0.125,
+      "reward_std": 0.3535533845424652,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.125,
+      "rewards/format_reward_func/std": 0.3535533845424652,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 214.5,
+      "completions/max_terminated_length": 214.5,
+      "completions/mean_length": 108.6875,
+      "completions/mean_terminated_length": 108.6875,
+      "completions/min_length": 34.0,
+      "completions/min_terminated_length": 34.0,
+      "epoch": 6.344501922610672e-05,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0035000136122107506,
+      "kl": 0.0019113743601337774,
+      "learning_rate": 4.282022261367073e-07,
+      "loss": 0.0,
+      "num_tokens": 72919.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.0625,
+      "rewards/format_reward_func/std": 0.1767766922712326,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 583.5,
+      "completions/max_terminated_length": 583.5,
+      "completions/mean_length": 217.625,
+      "completions/mean_terminated_length": 217.625,
+      "completions/min_length": 37.0,
+      "completions/min_terminated_length": 37.0,
+      "epoch": 6.797680631368577e-05,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0021501195151358843,
+      "kl": 0.0015104188605619129,
+      "learning_rate": 4.1647880625292027e-07,
+      "loss": 0.0,
+      "num_tokens": 78713.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 335.5,
+      "completions/max_terminated_length": 335.5,
+      "completions/mean_length": 134.75,
+      "completions/mean_terminated_length": 134.75,
+      "completions/min_length": 32.0,
+      "completions/min_terminated_length": 32.0,
+      "epoch": 7.250859340126482e-05,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 14.151634216308594,
+      "kl": 0.0026571638591121882,
+      "learning_rate": 4.040571164002318e-07,
+      "loss": 0.0,
+      "num_tokens": 83149.0,
+      "reward": 0.1875,
+      "reward_std": 0.408231720328331,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.1875,
+      "rewards/format_reward_func/std": 0.408231720328331,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 381.5,
+      "completions/max_terminated_length": 381.5,
+      "completions/mean_length": 190.625,
+      "completions/mean_terminated_length": 190.625,
+      "completions/min_length": 34.5,
+      "completions/min_terminated_length": 34.5,
+      "epoch": 7.704038048884388e-05,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.006240386515855789,
+      "kl": 0.0022764305977034383,
+      "learning_rate": 3.909892574627266e-07,
+      "loss": 0.0,
+      "num_tokens": 88487.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.0,
+      "rewards/format_reward_func/std": 0.0,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 745.0,
+      "completions/max_terminated_length": 745.0,
+      "completions/mean_length": 219.75,
+      "completions/mean_terminated_length": 219.75,
+      "completions/min_length": 67.5,
+      "completions/min_terminated_length": 67.5,
+      "epoch": 8.157216757642292e-05,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 16.2641544342041,
+      "kl": 0.002281889770529233,
+      "learning_rate": 3.773300405821908e-07,
+      "loss": 0.0,
+      "num_tokens": 94251.0,
+      "reward": 0.125,
+      "reward_std": 0.3535533845424652,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.125,
+      "rewards/format_reward_func/std": 0.3535533845424652,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 436.0,
+      "completions/max_terminated_length": 436.0,
+      "completions/mean_length": 172.5,
+      "completions/mean_terminated_length": 172.5,
+      "completions/min_length": 66.0,
+      "completions/min_terminated_length": 66.0,
+      "epoch": 8.610395466400197e-05,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 15.585210800170898,
+      "kl": 0.010958031325571937,
+      "learning_rate": 3.6313675726113475e-07,
+      "loss": 0.0,
+      "num_tokens": 99331.0,
+      "reward": 0.125,
+      "reward_std": 0.2314550280570984,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.125,
+      "rewards/format_reward_func/std": 0.2314550280570984,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 644.5,
+      "completions/max_terminated_length": 644.5,
+      "completions/mean_length": 238.0,
+      "completions/mean_terminated_length": 238.0,
+      "completions/min_length": 64.0,
+      "completions/min_terminated_length": 64.0,
+      "epoch": 9.063574175158103e-05,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.0022101891227066517,
+      "kl": 0.003324320729007013,
+      "learning_rate": 3.484689390623218e-07,
+      "loss": 0.0,
+      "num_tokens": 105419.0,
+      "reward": 0.125,
+      "reward_std": 0.2314550280570984,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.125,
+      "rewards/format_reward_func/std": 0.2314550280570984,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 427.0,
+      "completions/max_terminated_length": 427.0,
+      "completions/mean_length": 203.3125,
+      "completions/mean_terminated_length": 203.3125,
+      "completions/min_length": 67.0,
+      "completions/min_terminated_length": 67.0,
+      "epoch": 9.516752883916008e-05,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 16.7834529876709,
+      "kl": 0.002571089873526944,
+      "learning_rate": 3.3338810791270517e-07,
+      "loss": 0.0,
+      "num_tokens": 110992.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.0625,
+      "rewards/format_reward_func/std": 0.1767766922712326,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 267.5,
+      "completions/max_terminated_length": 267.5,
+      "completions/mean_length": 119.25,
+      "completions/mean_terminated_length": 119.25,
+      "completions/min_length": 55.5,
+      "completions/min_terminated_length": 55.5,
+      "epoch": 9.969931592673912e-05,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 13.29578685760498,
+      "kl": 0.0029905991395935416,
+      "learning_rate": 3.179575180590857e-07,
+      "loss": 0.0,
+      "num_tokens": 115204.0,
+      "reward": 0.125,
+      "reward_std": 0.3535533845424652,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.125,
+      "rewards/format_reward_func/std": 0.3535533845424652,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 326.5,
+      "completions/max_terminated_length": 326.5,
+      "completions/mean_length": 152.5,
+      "completions/mean_terminated_length": 152.5,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "epoch": 0.00010423110301431818,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 11.780332565307617,
+      "kl": 0.004898008599411696,
+      "learning_rate": 3.022418907578188e-07,
+      "loss": 0.0,
+      "num_tokens": 119916.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.0625,
+      "rewards/format_reward_func/std": 0.1767766922712326,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 756.5,
+      "completions/max_terminated_length": 756.5,
+      "completions/mean_length": 237.625,
+      "completions/mean_terminated_length": 237.625,
+      "completions/min_length": 35.0,
+      "completions/min_terminated_length": 35.0,
+      "epoch": 0.00010876289010189723,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 14.306221008300781,
+      "kl": 0.004835129271668848,
+      "learning_rate": 2.863071428113726e-07,
+      "loss": 0.0,
+      "num_tokens": 125990.0,
+      "reward": 0.1875,
+      "reward_std": 0.408231720328331,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.1875,
+      "rewards/format_reward_func/std": 0.408231720328331,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 405.5,
+      "completions/max_terminated_length": 405.5,
+      "completions/mean_length": 189.9375,
+      "completions/mean_terminated_length": 189.9375,
+      "completions/min_length": 67.0,
+      "completions/min_terminated_length": 67.0,
+      "epoch": 0.00011329467718947628,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 10.71300983428955,
+      "kl": 0.006452581874327734,
+      "learning_rate": 2.7022011009035107e-07,
+      "loss": 0.0,
+      "num_tokens": 131301.0,
+      "reward": 0.125,
+      "reward_std": 0.3535533845424652,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.125,
+      "rewards/format_reward_func/std": 0.3535533845424652,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 361.0,
+      "completions/max_terminated_length": 361.0,
+      "completions/mean_length": 157.0625,
+      "completions/mean_terminated_length": 157.0625,
+      "completions/min_length": 59.0,
+      "completions/min_terminated_length": 59.0,
+      "epoch": 0.00011782646427705534,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 11.726592063903809,
+      "kl": 0.0094611946187797,
+      "learning_rate": 2.540482672006254e-07,
+      "loss": 0.0,
+      "num_tokens": 136102.0,
+      "reward": 0.25,
+      "reward_std": 0.26726123690605164,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.25,
+      "rewards/format_reward_func/std": 0.26726123690605164,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 213.0,
+      "completions/max_terminated_length": 213.0,
+      "completions/mean_length": 117.6875,
+      "completions/mean_terminated_length": 117.6875,
+      "completions/min_length": 44.5,
+      "completions/min_terminated_length": 44.5,
+      "epoch": 0.00012235825136463439,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 13.030102729797363,
+      "kl": 0.006068318209145218,
+      "learning_rate": 2.37859444471388e-07,
+      "loss": 0.0,
+      "num_tokens": 140241.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.0625,
+      "rewards/format_reward_func/std": 0.1767766922712326,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 333.0,
+      "completions/max_terminated_length": 333.0,
+      "completions/mean_length": 136.25,
+      "completions/mean_terminated_length": 136.25,
+      "completions/min_length": 36.0,
+      "completions/min_terminated_length": 36.0,
+      "epoch": 0.00012689003845221345,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 10.891292572021484,
+      "kl": 0.009872130773146637,
+      "learning_rate": 2.2172154345117894e-07,
+      "loss": 0.0,
+      "num_tokens": 144701.0,
+      "reward": 0.25,
+      "reward_std": 0.4355512708425522,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.25,
+      "rewards/format_reward_func/std": 0.4355513006448746,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 663.5,
+      "completions/max_terminated_length": 663.5,
+      "completions/mean_length": 196.625,
+      "completions/mean_terminated_length": 196.625,
+      "completions/min_length": 59.5,
+      "completions/min_terminated_length": 59.5,
+      "epoch": 0.00013142182553979248,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 11.184815406799316,
+      "kl": 0.01285637664841488,
+      "learning_rate": 2.0570225210519433e-07,
+      "loss": 0.0,
+      "num_tokens": 150159.0,
+      "reward": 0.1875,
+      "reward_std": 0.408231720328331,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.1875,
+      "rewards/format_reward_func/std": 0.408231720328331,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 308.0,
+      "completions/max_terminated_length": 308.0,
+      "completions/mean_length": 140.4375,
+      "completions/mean_terminated_length": 140.4375,
+      "completions/min_length": 50.0,
+      "completions/min_terminated_length": 50.0,
+      "epoch": 0.00013595361262737154,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.02632570080459118,
+      "kl": 0.014614543215429876,
+      "learning_rate": 1.8986876090843664e-07,
+      "loss": 0.0,
+      "num_tokens": 154654.0,
+      "reward": 0.1875,
+      "reward_std": 0.2587745785713196,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.1875,
+      "rewards/format_reward_func/std": 0.25877460837364197,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 270.0,
+      "completions/max_terminated_length": 270.0,
+      "completions/mean_length": 104.0625,
+      "completions/mean_terminated_length": 104.0625,
+      "completions/min_length": 38.0,
+      "completions/min_terminated_length": 38.0,
+      "epoch": 0.0001404853997149506,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 11.164044380187988,
+      "kl": 0.007626835664268583,
+      "learning_rate": 1.7428748102551234e-07,
+      "loss": 0.0,
+      "num_tokens": 158599.0,
+      "reward": 0.625,
+      "reward_std": 0.49871626496315,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.625,
+      "rewards/format_reward_func/std": 0.49871626496315,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 160.5,
+      "completions/max_terminated_length": 160.5,
+      "completions/mean_length": 90.625,
+      "completions/mean_terminated_length": 90.625,
+      "completions/min_length": 47.0,
+      "completions/min_terminated_length": 47.0,
+      "epoch": 0.00014501718680252963,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 16.527629852294922,
+      "kl": 0.011474673578049988,
+      "learning_rate": 1.5902376575912814e-07,
+      "loss": 0.0,
+      "num_tokens": 162289.0,
+      "reward": 0.3125,
+      "reward_std": 0.44403792917728424,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.3125,
+      "rewards/format_reward_func/std": 0.44403792917728424,
+      "step": 64
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 398.5,
+      "completions/max_terminated_length": 398.5,
+      "completions/mean_length": 129.0625,
+      "completions/mean_terminated_length": 129.0625,
+      "completions/min_length": 47.0,
+      "completions/min_terminated_length": 47.0,
+      "epoch": 0.0001495489738901087,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 12.219941139221191,
+      "kl": 0.009613552174414508,
+      "learning_rate": 1.4414163643562753e-07,
+      "loss": 0.0,
+      "num_tokens": 166674.0,
+      "reward": 0.25,
+      "reward_std": 0.4355512708425522,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.25,
+      "rewards/format_reward_func/std": 0.4355513006448746,
+      "step": 66
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 244.5,
+      "completions/max_terminated_length": 244.5,
+      "completions/mean_length": 109.1875,
+      "completions/mean_terminated_length": 109.1875,
+      "completions/min_length": 45.5,
+      "completions/min_terminated_length": 45.5,
+      "epoch": 0.00015408076097768775,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 20.217954635620117,
+      "kl": 0.020021719275973737,
+      "learning_rate": 1.2970351387729872e-07,
+      "loss": 0.0,
+      "num_tokens": 170693.0,
+      "reward": 0.375,
+      "reward_std": 0.5175491571426392,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.375,
+      "rewards/format_reward_func/std": 0.5175492167472839,
+      "step": 68
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 329.0,
+      "completions/max_terminated_length": 329.0,
+      "completions/mean_length": 127.0625,
+      "completions/mean_terminated_length": 127.0625,
+      "completions/min_length": 45.0,
+      "completions/min_terminated_length": 45.0,
+      "epoch": 0.0001586125480652668,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 12.027848243713379,
+      "kl": 0.01639675306796562,
+      "learning_rate": 1.1576995658775404e-07,
+      "loss": 0.0,
+      "num_tokens": 175014.0,
+      "reward": 0.4375,
+      "reward_std": 0.5260358154773712,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.4375,
+      "rewards/format_reward_func/std": 0.5260358452796936,
+      "step": 70
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 375.0,
+      "completions/max_terminated_length": 375.0,
+      "completions/mean_length": 142.1875,
+      "completions/mean_terminated_length": 142.1875,
+      "completions/min_length": 52.0,
+      "completions/min_terminated_length": 52.0,
+      "epoch": 0.00016314433515284585,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 10.927833557128906,
+      "kl": 0.011894080380443484,
+      "learning_rate": 1.0239940674851941e-07,
+      "loss": 0.0,
+      "num_tokens": 179577.0,
+      "reward": 0.25,
+      "reward_std": 0.4629100561141968,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.25,
+      "rewards/format_reward_func/std": 0.4629100561141968,
+      "step": 72
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 951.0,
+      "completions/max_terminated_length": 951.0,
+      "completions/mean_length": 273.6875,
+      "completions/mean_terminated_length": 273.6875,
+      "completions/min_length": 41.5,
+      "completions/min_terminated_length": 41.5,
+      "epoch": 0.0001676761222404249,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 6.783898830413818,
+      "kl": 0.014139190083369613,
+      "learning_rate": 8.964794509221507e-08,
+      "loss": 0.0,
+      "num_tokens": 186236.0,
+      "reward": 0.375,
+      "reward_std": 0.4355512708425522,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.375,
+      "rewards/format_reward_func/std": 0.4355513006448746,
+      "step": 74
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 174.5,
+      "completions/max_terminated_length": 174.5,
+      "completions/mean_length": 106.25,
+      "completions/mean_terminated_length": 106.25,
+      "completions/min_length": 44.0,
+      "completions/min_terminated_length": 44.0,
+      "epoch": 0.00017220790932800394,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 12.524798393249512,
+      "kl": 0.013552291362429969,
+      "learning_rate": 7.756905568047392e-08,
+      "loss": 0.0,
+      "num_tokens": 190168.0,
+      "reward": 0.5625,
+      "reward_std": 0.5260358154773712,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.5625,
+      "rewards/format_reward_func/std": 0.5260358452796936,
+      "step": 76
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 647.5,
+      "completions/max_terminated_length": 647.5,
+      "completions/mean_length": 207.5625,
+      "completions/mean_terminated_length": 207.5625,
+      "completions/min_length": 46.0,
+      "completions/min_terminated_length": 46.0,
+      "epoch": 0.000176739696415583,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 10.13427448272705,
+      "kl": 0.01421075320104137,
+      "learning_rate": 6.621340157319996e-08,
+      "loss": 0.0,
+      "num_tokens": 195793.0,
+      "reward": 0.375,
+      "reward_std": 0.4355512708425522,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.375,
+      "rewards/format_reward_func/std": 0.4355513006448746,
+      "step": 78
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 620.0,
+      "completions/max_terminated_length": 281.0,
+      "completions/mean_length": 187.5625,
+      "completions/mean_terminated_length": 134.85714721679688,
+      "completions/min_length": 40.0,
+      "completions/min_terminated_length": 40.0,
+      "epoch": 0.00018127148350316206,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 16.1104793548584,
+      "kl": 0.014304678879852872,
+      "learning_rate": 5.5628612330087724e-08,
+      "loss": 0.0,
+      "num_tokens": 201050.0,
+      "reward": 0.25,
+      "reward_std": 0.4355512708425522,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.25,
+      "rewards/format_reward_func/std": 0.4355513006448746,
+      "step": 80
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 746.0,
+      "completions/max_terminated_length": 345.5,
+      "completions/mean_length": 174.6875,
+      "completions/mean_terminated_length": 116.39286041259766,
+      "completions/min_length": 50.5,
+      "completions/min_terminated_length": 50.5,
+      "epoch": 0.0001858032705907411,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 13.687472343444824,
+      "kl": 0.012647167037357576,
+      "learning_rate": 4.5859084235697235e-08,
+      "loss": 0.0,
+      "num_tokens": 206125.0,
+      "reward": 0.25,
+      "reward_std": 0.4629100561141968,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.25,
+      "rewards/format_reward_func/std": 0.4629100561141968,
+      "step": 82
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 240.5,
+      "completions/max_terminated_length": 240.5,
+      "completions/mean_length": 104.75,
+      "completions/mean_terminated_length": 104.75,
+      "completions/min_length": 56.0,
+      "completions/min_terminated_length": 56.0,
+      "epoch": 0.00019033505767832015,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 27.33205795288086,
+      "kl": 0.01843659658334218,
+      "learning_rate": 3.6945794086007705e-08,
+      "loss": 0.0,
+      "num_tokens": 210097.0,
+      "reward": 0.625,
+      "reward_std": 0.49871626496315,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.625,
+      "rewards/format_reward_func/std": 0.49871626496315,
+      "step": 84
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 516.5,
+      "completions/max_terminated_length": 516.5,
+      "completions/mean_length": 146.9375,
+      "completions/mean_terminated_length": 146.9375,
+      "completions/min_length": 45.0,
+      "completions/min_terminated_length": 45.0,
+      "epoch": 0.00019486684476589921,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 17.247251510620117,
+      "kl": 0.014375057930010371,
+      "learning_rate": 2.892612731749414e-08,
+      "loss": 0.0,
+      "num_tokens": 214696.0,
+      "reward": 0.375,
+      "reward_std": 0.5175491571426392,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.375,
+      "rewards/format_reward_func/std": 0.5175492167472839,
+      "step": 86
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 176.0,
+      "completions/max_terminated_length": 176.0,
+      "completions/mean_length": 92.375,
+      "completions/mean_terminated_length": 92.375,
+      "completions/min_length": 39.0,
+      "completions/min_terminated_length": 39.0,
+      "epoch": 0.00019939863185347825,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 12.001086235046387,
+      "kl": 0.015669465501559898,
+      "learning_rate": 2.183372119961499e-08,
+      "loss": 0.0,
+      "num_tokens": 218470.0,
+      "reward": 0.1875,
+      "reward_std": 0.408231720328331,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.1875,
+      "rewards/format_reward_func/std": 0.408231720328331,
+      "step": 88
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 522.0,
+      "completions/max_terminated_length": 522.0,
+      "completions/mean_length": 174.0,
+      "completions/mean_terminated_length": 174.0,
+      "completions/min_length": 55.5,
+      "completions/min_terminated_length": 55.5,
+      "epoch": 0.0002039304189410573,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 11.844170570373535,
+      "kl": 0.009546459768898785,
+      "learning_rate": 1.5698323748414122e-08,
+      "loss": 0.0,
+      "num_tokens": 223534.0,
+      "reward": 0.375,
+      "reward_std": 0.4355512708425522,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.375,
+      "rewards/format_reward_func/std": 0.4355513006448746,
+      "step": 90
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 331.5,
+      "completions/max_terminated_length": 331.5,
+      "completions/mean_length": 151.8125,
+      "completions/mean_terminated_length": 151.8125,
+      "completions/min_length": 48.5,
+      "completions/min_terminated_length": 48.5,
+      "epoch": 0.00020846220602863637,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 11.816337585449219,
+      "kl": 0.01593888070783578,
+      "learning_rate": 1.054566895300324e-08,
+      "loss": 0.0,
+      "num_tokens": 228243.0,
+      "reward": 0.5625,
+      "reward_std": 0.5260358154773712,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.5625,
+      "rewards/format_reward_func/std": 0.5260358452796936,
+      "step": 92
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 387.0,
+      "completions/max_terminated_length": 387.0,
+      "completions/mean_length": 163.6875,
+      "completions/mean_terminated_length": 163.6875,
+      "completions/min_length": 33.0,
+      "completions/min_terminated_length": 33.0,
+      "epoch": 0.0002129939931162154,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 11.730420112609863,
+      "kl": 0.01412112163961865,
+      "learning_rate": 6.397368838268496e-09,
+      "loss": 0.0,
+      "num_tokens": 233110.0,
+      "reward": 0.1875,
+      "reward_std": 0.408231720328331,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.1875,
+      "rewards/format_reward_func/std": 0.408231720328331,
+      "step": 94
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 420.0,
+      "completions/max_terminated_length": 420.0,
+      "completions/mean_length": 192.4375,
+      "completions/mean_terminated_length": 192.4375,
+      "completions/min_length": 50.0,
+      "completions/min_terminated_length": 50.0,
+      "epoch": 0.00021752578020379446,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 15.307268142700195,
+      "kl": 0.00981484999647364,
+      "learning_rate": 3.2708228165273244e-09,
+      "loss": 0.0,
+      "num_tokens": 238493.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.0625,
+      "rewards/format_reward_func/std": 0.1767766922712326,
+      "step": 96
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 372.5,
+      "completions/max_terminated_length": 372.5,
+      "completions/mean_length": 108.1875,
+      "completions/mean_terminated_length": 108.1875,
+      "completions/min_length": 37.0,
+      "completions/min_terminated_length": 37.0,
+      "epoch": 0.00022205756729137352,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 23.4942684173584,
+      "kl": 0.022864260390633717,
+      "learning_rate": 1.1791447083465133e-09,
+      "loss": 0.0,
+      "num_tokens": 242536.0,
+      "reward": 0.25,
+      "reward_std": 0.4629100561141968,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.25,
+      "rewards/format_reward_func/std": 0.4629100561141968,
+      "step": 98
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 426.0,
+      "completions/max_terminated_length": 426.0,
+      "completions/mean_length": 194.3125,
+      "completions/mean_terminated_length": 194.3125,
+      "completions/min_length": 37.0,
+      "completions/min_terminated_length": 37.0,
+      "epoch": 0.00022658935437895256,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 23.61033058166504,
+      "kl": 0.0231838297622744,
+      "learning_rate": 1.3110773862126667e-10,
+      "loss": 0.0,
+      "num_tokens": 247917.0,
+      "reward": 0.4375,
+      "reward_std": 0.5260358154773712,
+      "rewards/equation_reward_func/mean": 0.0,
+      "rewards/equation_reward_func/std": 0.0,
+      "rewards/format_reward_func/mean": 0.4375,
+      "rewards/format_reward_func/std": 0.5260358452796936,
+      "step": 100
+    },
+    {
+      "epoch": 0.00022658935437895256,
+      "step": 100,
+      "total_flos": 0.0,
+      "train_loss": 8.04626297735922e-06,
+      "train_runtime": 2640.6632,
+      "train_samples_per_second": 0.303,
+      "train_steps_per_second": 0.038
+    }
+  ],
+  "logging_steps": 2,
+  "max_steps": 100,
+  "num_input_tokens_seen": 247917,
+  "num_train_epochs": 1,
+  "save_steps": 25,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}