jdqqjr committed
Commit 0feded0 · verified · 1 Parent(s): c25fa5f

Upload folder using huggingface_hub

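The commit message above says the repository contents were pushed with `huggingface_hub`. As a hedged illustration (the repo id and local folder path below are placeholders, not taken from this commit), such an upload is typically done like this:

```python
# Hypothetical reproduction of an "Upload folder using huggingface_hub" commit.
# repo_id and folder_path are illustrative placeholders.
from huggingface_hub import HfApi

api = HfApi()  # assumes HF_TOKEN or a prior `huggingface-cli login`
api.upload_folder(
    folder_path="./outputs/my-finetuned-model",   # local training output directory
    repo_id="jdqqjr/<repo-name>",                 # placeholder repo id
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
```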
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
all_results.json ADDED
@@ -0,0 +1,8 @@
+{
+  "total_flos": 0.0,
+  "train_loss": 0.014638028192195337,
+  "train_runtime": 22832.117,
+  "train_samples": 11742,
+  "train_samples_per_second": 0.077,
+  "train_steps_per_second": 0.01
+}
config.json ADDED
@@ -0,0 +1,29 @@
+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "initializer_range": 0.02,
+  "intermediate_size": 8960,
+  "max_position_embeddings": 131072,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000,
+  "sliding_window": 4096,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.50.0.dev0",
+  "use_cache": false,
+  "use_mrope": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
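config.json describes a Qwen2-architecture causal LM in the 1.5B-class shape: 28 layers, hidden size 1536, grouped-query attention with 2 KV heads, untied embeddings, and a 151,936-token vocabulary. A minimal sketch of instantiating it with transformers, assuming the repo id placeholder below is replaced with this repository:

```python
# Minimal sketch: load the model described by this config.json.
# "your-namespace/this-repo" is a placeholder; torch and transformers >= 4.50 assumed.
import torch
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "your-namespace/this-repo"  # placeholder
config = AutoConfig.from_pretrained(repo_id)
print(config.model_type, config.num_hidden_layers, config.hidden_size)  # qwen2 28 1536

model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16" in the config
)
```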
generation_config.json ADDED
@@ -0,0 +1,9 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 151646,
+  "do_sample": true,
+  "eos_token_id": 151643,
+  "temperature": 0.6,
+  "top_p": 0.95,
+  "transformers_version": "4.50.0.dev0"
+}
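generation_config.json enables sampling with temperature 0.6 and top-p 0.95. Note that its `bos_token_id` (151646, the tokenizer's `<|begin▁of▁sentence|>`) differs from the 151643 in config.json. A hedged usage sketch, with a placeholder repo id and prompt:

```python
# Minimal sketch: sample from the model with the committed generation settings
# (do_sample=True, temperature=0.6, top_p=0.95). repo_id is a placeholder.
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "your-namespace/this-repo"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id)

inputs = tokenizer("Solve: 2 + 2 =", return_tensors="pt")
# generate() picks up generation_config.json automatically; values repeated for clarity.
output_ids = model.generate(
    **inputs,
    do_sample=True,
    temperature=0.6,
    top_p=0.95,
    max_new_tokens=256,
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```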
model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5cf6811f62b78ba049e4704a36fe4435c8c5c001eca3898352cbf216a441f049
+size 3554214752
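model.safetensors is stored through Git LFS, so the diff shows only the pointer file; the 3,554,214,752-byte payload is consistent with roughly 1.78 B parameters in bfloat16 (2 bytes each). A hedged sketch for inspecting the resolved file after download (the repo id is a placeholder):

```python
# Minimal sketch: inspect the downloaded weight file (not the LFS pointer).
from huggingface_hub import hf_hub_download
from safetensors import safe_open

path = hf_hub_download(repo_id="your-namespace/this-repo",  # placeholder
                       filename="model.safetensors")

total = 0
with safe_open(path, framework="pt", device="cpu") as f:
    for name in f.keys():
        n = 1
        for dim in f.get_slice(name).get_shape():  # shapes only; tensors stay on disk
            n *= dim
        total += n
print(f"{total / 1e9:.2f}B parameters")  # expected ≈ 1.78B for this checkpoint
```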
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<|begin▁of▁sentence|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|end▁of▁sentence|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|end▁of▁sentence|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
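special_tokens_map.json maps BOS to `<|begin▁of▁sentence|>` and reuses `<|end▁of▁sentence|>` as both the EOS and padding token (with left padding, per tokenizer_config.json below). A quick check, again with a placeholder repo id:

```python
# Minimal sketch: confirm how the special tokens resolve to ids.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("your-namespace/this-repo")  # placeholder
print(tokenizer.bos_token, tokenizer.bos_token_id)  # <|begin▁of▁sentence|> 151646
print(tokenizer.eos_token, tokenizer.eos_token_id)  # <|end▁of▁sentence|> 151643
print(tokenizer.pad_token == tokenizer.eos_token)   # True: padding reuses EOS
```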
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a4256422650d141f228fe954acee98679da412984c29a569877eefd3af69315a
+size 11422959
tokenizer_config.json ADDED
@@ -0,0 +1,196 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|end▁of▁sentence|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|User|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151645": {
+      "content": "<|Assistant|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151646": {
+      "content": "<|begin▁of▁sentence|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|EOT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151648": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151649": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<|begin▁of▁sentence|>",
+  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|><think>\\n'}}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|end▁of▁sentence|>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 16384,
+  "pad_token": "<|end▁of▁sentence|>",
+  "padding_side": "left",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizerFast",
+  "unk_token": null,
+  "use_default_system_prompt": false
+}
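The `chat_template` above wraps user turns in `<|User|>`, closes assistant turns with `<|end▁of▁sentence|>`, strips any earlier `<think>…</think>` reasoning from prior assistant messages, and appends `<|Assistant|><think>\n` when a generation prompt is requested. A hedged sketch of rendering it (repo id is a placeholder):

```python
# Minimal sketch: render the committed chat template for a short conversation.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("your-namespace/this-repo")  # placeholder
messages = [
    {"role": "system", "content": "You are a careful math assistant."},
    {"role": "user", "content": "What is 17 * 23?"},
]
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,  # appends "<|Assistant|><think>\n" per the template
)
print(prompt)
# <|begin▁of▁sentence|>You are a careful math assistant.<|User|>What is 17 * 23?<|Assistant|><think>
```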
train_results.json ADDED
@@ -0,0 +1,8 @@
+{
+  "total_flos": 0.0,
+  "train_loss": 0.014638028192195337,
+  "train_runtime": 22832.117,
+  "train_samples": 11742,
+  "train_samples_per_second": 0.077,
+  "train_steps_per_second": 0.01
+}
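train_results.json (identical to all_results.json) reports a 22,832-second run at 0.077 samples/s, i.e. roughly 1,760 samples seen, which matches the ~0.15 epoch over 11,742 training samples recorded in trainer_state.json below. A small sketch of that sanity check plus skimming the step-wise reward log, assuming the committed JSON files are available locally:

```python
# Minimal sketch: sanity-check train_results.json and skim trainer_state.json.
import json

with open("train_results.json") as f:
    results = json.load(f)

seen = results["train_runtime"] * results["train_samples_per_second"]
print(f"≈{seen:.0f} samples seen of {results['train_samples']}")  # ≈1758 of 11742

with open("trainer_state.json") as f:
    state = json.load(f)

# log_history holds per-step metrics: reward, reward_std, kl, completion_length, ...
for entry in state["log_history"]:
    if "reward" in entry:
        print(entry["step"], round(entry["reward"], 3), round(entry.get("kl", 0.0), 4))
```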
trainer_state.json ADDED
@@ -0,0 +1,1733 @@
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.15057060126043265,
6
+ "eval_steps": 100,
7
+ "global_step": 221,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "completion_length": 701.7250213623047,
14
+ "epoch": 0.0013626298756600238,
15
+ "grad_norm": 0.826935350894928,
16
+ "kl": 0.0,
17
+ "learning_rate": 1.7391304347826088e-06,
18
+ "loss": -0.0,
19
+ "reward": 3.2898273691534996,
20
+ "reward_std": 1.2225780645385385,
21
+ "rewards/AnswerChecker_LenCheck_Reward": 0.41482741106301546,
22
+ "rewards/all-MiniLM-L6-v2": -0.07499999832361937,
23
+ "rewards/format_reward_beginEndSolutionFormat": 2.0,
24
+ "rewards/length_reward": 0.9500000029802322,
25
+ "step": 2
26
+ },
27
+ {
28
+ "completion_length": 832.4125137329102,
29
+ "epoch": 0.0027252597513200477,
30
+ "grad_norm": 0.6933600902557373,
31
+ "kl": 0.00024503469467163086,
32
+ "learning_rate": 3.4782608695652175e-06,
33
+ "loss": 0.0,
34
+ "reward": 3.45002019405365,
35
+ "reward_std": 1.13982825120911,
36
+ "rewards/AnswerChecker_LenCheck_Reward": 0.4375201063230634,
37
+ "rewards/all-MiniLM-L6-v2": 0.11249999888241291,
38
+ "rewards/format_reward_beginEndSolutionFormat": 2.0,
39
+ "rewards/length_reward": 0.9000000059604645,
40
+ "step": 4
41
+ },
42
+ {
43
+ "completion_length": 731.8750114440918,
44
+ "epoch": 0.004087889626980072,
45
+ "grad_norm": 0.7940042614936829,
46
+ "kl": 0.0002880692481994629,
47
+ "learning_rate": 5.2173913043478265e-06,
48
+ "loss": 0.0,
49
+ "reward": 2.9608234241604805,
50
+ "reward_std": 1.052665668539703,
51
+ "rewards/AnswerChecker_LenCheck_Reward": 0.4483234500512481,
52
+ "rewards/all-MiniLM-L6-v2": -0.3874999973922968,
53
+ "rewards/format_reward_beginEndSolutionFormat": 2.0,
54
+ "rewards/length_reward": 0.9000000013038516,
55
+ "step": 6
56
+ },
57
+ {
58
+ "completion_length": 787.5625152587891,
59
+ "epoch": 0.005450519502640095,
60
+ "grad_norm": 0.7262738347053528,
61
+ "kl": 0.00028836727142333984,
62
+ "learning_rate": 6.956521739130435e-06,
63
+ "loss": 0.0,
64
+ "reward": 2.5098357554525137,
65
+ "reward_std": 1.7445468110963702,
66
+ "rewards/AnswerChecker_LenCheck_Reward": 0.409835753031075,
67
+ "rewards/all-MiniLM-L6-v2": -0.4249999960884452,
68
+ "rewards/format_reward_beginEndSolutionFormat": 1.8499999977648258,
69
+ "rewards/length_reward": 0.6750000044703484,
70
+ "step": 8
71
+ },
72
+ {
73
+ "completion_length": 751.5000133514404,
74
+ "epoch": 0.00681314937830012,
75
+ "grad_norm": 0.7745183110237122,
76
+ "kl": 0.0004850625991821289,
77
+ "learning_rate": 8.695652173913044e-06,
78
+ "loss": 0.0,
79
+ "reward": 2.9620843827724457,
80
+ "reward_std": 1.278110965853557,
81
+ "rewards/AnswerChecker_LenCheck_Reward": 0.47458438016474247,
82
+ "rewards/all-MiniLM-L6-v2": -0.2749999985098839,
83
+ "rewards/format_reward_beginEndSolutionFormat": 1.9624999985098839,
84
+ "rewards/length_reward": 0.8000000091269612,
85
+ "step": 10
86
+ },
87
+ {
88
+ "completion_length": 744.9250144958496,
89
+ "epoch": 0.008175779253960144,
90
+ "grad_norm": 0.7855518460273743,
91
+ "kl": 0.0019762516021728516,
92
+ "learning_rate": 1.0434782608695653e-05,
93
+ "loss": 0.0001,
94
+ "reward": 2.617582857608795,
95
+ "reward_std": 1.4501504292711616,
96
+ "rewards/AnswerChecker_LenCheck_Reward": 0.41758283600211143,
97
+ "rewards/all-MiniLM-L6-v2": -0.49999999813735485,
98
+ "rewards/format_reward_beginEndSolutionFormat": 1.9249999970197678,
99
+ "rewards/length_reward": 0.7750000087544322,
100
+ "step": 12
101
+ },
102
+ {
103
+ "completion_length": 853.975025177002,
104
+ "epoch": 0.009538409129620166,
105
+ "grad_norm": 0.7132939100265503,
106
+ "kl": 0.0023059844970703125,
107
+ "learning_rate": 1.2173913043478263e-05,
108
+ "loss": 0.0001,
109
+ "reward": 3.0857974849641323,
110
+ "reward_std": 1.1976260086521506,
111
+ "rewards/AnswerChecker_LenCheck_Reward": 0.3857975276187062,
112
+ "rewards/all-MiniLM-L6-v2": -0.11249999608844519,
113
+ "rewards/format_reward_beginEndSolutionFormat": 1.9624999985098839,
114
+ "rewards/length_reward": 0.850000006146729,
115
+ "step": 14
116
+ },
117
+ {
118
+ "completion_length": 809.7375106811523,
119
+ "epoch": 0.01090103900528019,
120
+ "grad_norm": 0.7411205172538757,
121
+ "kl": 0.0045413970947265625,
122
+ "learning_rate": 1.391304347826087e-05,
123
+ "loss": 0.0002,
124
+ "reward": 3.3214251101017,
125
+ "reward_std": 0.94907569559291,
126
+ "rewards/AnswerChecker_LenCheck_Reward": 0.40892498986795545,
127
+ "rewards/all-MiniLM-L6-v2": 0.037500000558793545,
128
+ "rewards/format_reward_beginEndSolutionFormat": 2.0,
129
+ "rewards/length_reward": 0.8750000046566129,
130
+ "step": 16
131
+ },
132
+ {
133
+ "completion_length": 813.1250114440918,
134
+ "epoch": 0.012263668880940215,
135
+ "grad_norm": 14.433080673217773,
136
+ "kl": 0.2021026611328125,
137
+ "learning_rate": 1.565217391304348e-05,
138
+ "loss": 0.0081,
139
+ "reward": 3.0686260610818863,
140
+ "reward_std": 1.5170891038142145,
141
+ "rewards/AnswerChecker_LenCheck_Reward": 0.48112595453858376,
142
+ "rewards/all-MiniLM-L6-v2": -0.2374999951571226,
143
+ "rewards/format_reward_beginEndSolutionFormat": 1.9249999970197678,
144
+ "rewards/length_reward": 0.9000000059604645,
145
+ "step": 18
146
+ },
147
+ {
148
+ "completion_length": 754.1750144958496,
149
+ "epoch": 0.01362629875660024,
150
+ "grad_norm": 2.9093778133392334,
151
+ "kl": 0.06426239013671875,
152
+ "learning_rate": 1.739130434782609e-05,
153
+ "loss": 0.0026,
154
+ "reward": 3.807109519839287,
155
+ "reward_std": 0.7755511063151062,
156
+ "rewards/AnswerChecker_LenCheck_Reward": 0.5696093924343586,
157
+ "rewards/all-MiniLM-L6-v2": 0.2875000014901161,
158
+ "rewards/format_reward_beginEndSolutionFormat": 2.0,
159
+ "rewards/length_reward": 0.9500000029802322,
160
+ "step": 20
161
+ },
162
+ {
163
+ "completion_length": 758.8125038146973,
164
+ "epoch": 0.014988928632260262,
165
+ "grad_norm": 0.8031071424484253,
166
+ "kl": 0.020557403564453125,
167
+ "learning_rate": 1.9130434782608697e-05,
168
+ "loss": 0.0008,
169
+ "reward": 3.0672582909464836,
170
+ "reward_std": 0.7253409158438444,
171
+ "rewards/AnswerChecker_LenCheck_Reward": 0.5797582101076841,
172
+ "rewards/all-MiniLM-L6-v2": -0.46249999944120646,
173
+ "rewards/format_reward_beginEndSolutionFormat": 2.0,
174
+ "rewards/length_reward": 0.9500000029802322,
175
+ "step": 22
176
+ },
177
+ {
178
+ "completion_length": 738.9500198364258,
179
+ "epoch": 0.016351558507920288,
180
+ "grad_norm": 0.7532008290290833,
181
+ "kl": 0.036285400390625,
182
+ "learning_rate": 1.9998741276738753e-05,
183
+ "loss": 0.0015,
184
+ "reward": 3.575051836669445,
185
+ "reward_std": 0.8001346649834886,
186
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6000517811626196,
187
+ "rewards/all-MiniLM-L6-v2": 0.025000003166496754,
188
+ "rewards/format_reward_beginEndSolutionFormat": 2.0,
189
+ "rewards/length_reward": 0.9500000029802322,
190
+ "step": 24
191
+ },
192
+ {
193
+ "completion_length": 772.2625160217285,
194
+ "epoch": 0.01771418838358031,
195
+ "grad_norm": 0.7200579643249512,
196
+ "kl": 0.08385467529296875,
197
+ "learning_rate": 1.9988673391830082e-05,
198
+ "loss": 0.0034,
199
+ "reward": 3.486709274351597,
200
+ "reward_std": 1.1654404955916107,
201
+ "rewards/AnswerChecker_LenCheck_Reward": 0.5367092387750745,
202
+ "rewards/all-MiniLM-L6-v2": 0.02500000409781933,
203
+ "rewards/format_reward_beginEndSolutionFormat": 2.0,
204
+ "rewards/length_reward": 0.9250000044703484,
205
+ "step": 26
206
+ },
207
+ {
208
+ "completion_length": 759.0875091552734,
209
+ "epoch": 0.019076818259240333,
210
+ "grad_norm": 0.7885477542877197,
211
+ "kl": 0.0690155029296875,
212
+ "learning_rate": 1.9968547759519426e-05,
213
+ "loss": 0.0028,
214
+ "reward": 3.937687858939171,
215
+ "reward_std": 0.8582523697987199,
216
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6376878507435322,
217
+ "rewards/all-MiniLM-L6-v2": 0.4000000022351742,
218
+ "rewards/format_reward_beginEndSolutionFormat": 2.0,
219
+ "rewards/length_reward": 0.9000000031664968,
220
+ "step": 28
221
+ },
222
+ {
223
+ "completion_length": 733.7500076293945,
224
+ "epoch": 0.020439448134900357,
225
+ "grad_norm": 0.6713956594467163,
226
+ "kl": 0.0721435546875,
227
+ "learning_rate": 1.9938384644612542e-05,
228
+ "loss": 0.0029,
229
+ "reward": 3.545550711452961,
230
+ "reward_std": 1.1149344076402485,
231
+ "rewards/AnswerChecker_LenCheck_Reward": 0.5705506596714258,
232
+ "rewards/all-MiniLM-L6-v2": 0.0625000037252903,
233
+ "rewards/format_reward_beginEndSolutionFormat": 1.9624999985098839,
234
+ "rewards/length_reward": 0.9500000029802322,
235
+ "step": 30
236
+ },
237
+ {
238
+ "completion_length": 793.4125213623047,
239
+ "epoch": 0.02180207801056038,
240
+ "grad_norm": 0.6639223098754883,
241
+ "kl": 0.06781005859375,
242
+ "learning_rate": 1.989821441880933e-05,
243
+ "loss": 0.0027,
244
+ "reward": 3.9647994488477707,
245
+ "reward_std": 1.1172373143490404,
246
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6397993620485067,
247
+ "rewards/all-MiniLM-L6-v2": 0.4375,
248
+ "rewards/format_reward_beginEndSolutionFormat": 1.9624999985098839,
249
+ "rewards/length_reward": 0.9250000044703484,
250
+ "step": 32
251
+ },
252
+ {
253
+ "completion_length": 713.4125213623047,
254
+ "epoch": 0.023164707886220406,
255
+ "grad_norm": 0.6671966910362244,
256
+ "kl": 0.0789947509765625,
257
+ "learning_rate": 1.9848077530122083e-05,
258
+ "loss": 0.0032,
259
+ "reward": 4.12811366468668,
260
+ "reward_std": 0.6696655503474176,
261
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6656134920194745,
262
+ "rewards/all-MiniLM-L6-v2": 0.5125000011175871,
263
+ "rewards/format_reward_beginEndSolutionFormat": 2.0,
264
+ "rewards/length_reward": 0.9500000001862645,
265
+ "step": 34
266
+ },
267
+ {
268
+ "completion_length": 621.1875171661377,
269
+ "epoch": 0.02452733776188043,
270
+ "grad_norm": 0.6927969455718994,
271
+ "kl": 0.100311279296875,
272
+ "learning_rate": 1.978802446214779e-05,
273
+ "loss": 0.004,
274
+ "reward": 3.9949512034654617,
275
+ "reward_std": 1.0779171623289585,
276
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6824510861188173,
277
+ "rewards/all-MiniLM-L6-v2": 0.3375000050291419,
278
+ "rewards/format_reward_beginEndSolutionFormat": 2.0,
279
+ "rewards/length_reward": 0.9750000014901161,
280
+ "step": 36
281
+ },
282
+ {
283
+ "completion_length": 561.4625148773193,
284
+ "epoch": 0.025889967637540454,
285
+ "grad_norm": 0.8079332113265991,
286
+ "kl": 0.105621337890625,
287
+ "learning_rate": 1.9718115683235418e-05,
288
+ "loss": 0.0042,
289
+ "reward": 3.3487851433455944,
290
+ "reward_std": 1.0449772235006094,
291
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6237850617617369,
292
+ "rewards/all-MiniLM-L6-v2": -0.049999997951090336,
293
+ "rewards/format_reward_beginEndSolutionFormat": 2.0,
294
+ "rewards/length_reward": 0.7750000106170774,
295
+ "step": 38
296
+ },
297
+ {
298
+ "completion_length": 600.0875129699707,
299
+ "epoch": 0.02725259751320048,
300
+ "grad_norm": 0.7116134762763977,
301
+ "kl": 0.100372314453125,
302
+ "learning_rate": 1.9638421585599422e-05,
303
+ "loss": 0.004,
304
+ "reward": 4.384867042303085,
305
+ "reward_std": 0.719090289901942,
306
+ "rewards/AnswerChecker_LenCheck_Reward": 0.7348670326173306,
307
+ "rewards/all-MiniLM-L6-v2": 0.7750000013038516,
308
+ "rewards/format_reward_beginEndSolutionFormat": 2.0,
309
+ "rewards/length_reward": 0.8750000074505806,
310
+ "step": 40
311
+ },
312
+ {
313
+ "completion_length": 598.8750190734863,
314
+ "epoch": 0.0286152273888605,
315
+ "grad_norm": 0.7759160399436951,
316
+ "kl": 0.143280029296875,
317
+ "learning_rate": 1.9549022414440738e-05,
318
+ "loss": 0.0057,
319
+ "reward": 3.826561488211155,
320
+ "reward_std": 0.9282979969866574,
321
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6765614040195942,
322
+ "rewards/all-MiniLM-L6-v2": 0.2249999986961484,
323
+ "rewards/format_reward_beginEndSolutionFormat": 2.0,
324
+ "rewards/length_reward": 0.9250000044703484,
325
+ "step": 42
326
+ },
327
+ {
328
+ "completion_length": 625.4625110626221,
329
+ "epoch": 0.029977857264520524,
330
+ "grad_norm": 0.7551826238632202,
331
+ "kl": 0.116241455078125,
332
+ "learning_rate": 1.9450008187146685e-05,
333
+ "loss": 0.0046,
334
+ "reward": 4.079624235630035,
335
+ "reward_std": 1.025521146133542,
336
+ "rewards/AnswerChecker_LenCheck_Reward": 0.7171240504831076,
337
+ "rewards/all-MiniLM-L6-v2": 0.4375000009313226,
338
+ "rewards/format_reward_beginEndSolutionFormat": 2.0,
339
+ "rewards/length_reward": 0.9250000016763806,
340
+ "step": 44
341
+ },
342
+ {
343
+ "completion_length": 759.9500160217285,
344
+ "epoch": 0.03134048714018055,
345
+ "grad_norm": 0.6660957932472229,
346
+ "kl": 0.10888671875,
347
+ "learning_rate": 1.9341478602651068e-05,
348
+ "loss": 0.0044,
349
+ "reward": 3.9495397955179214,
350
+ "reward_std": 1.2625608062371612,
351
+ "rewards/AnswerChecker_LenCheck_Reward": 0.7370398119091988,
352
+ "rewards/all-MiniLM-L6-v2": 0.3625000026077032,
353
+ "rewards/format_reward_beginEndSolutionFormat": 1.9249999970197678,
354
+ "rewards/length_reward": 0.9250000016763806,
355
+ "step": 46
356
+ },
357
+ {
358
+ "completion_length": 826.0875129699707,
359
+ "epoch": 0.032703117015840576,
360
+ "grad_norm": 0.6011426448822021,
361
+ "kl": 0.118682861328125,
362
+ "learning_rate": 1.9223542941045817e-05,
363
+ "loss": 0.0047,
364
+ "reward": 3.5684720501303673,
365
+ "reward_std": 0.6631585191935301,
366
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6809719651937485,
367
+ "rewards/all-MiniLM-L6-v2": -0.012500000186264515,
368
+ "rewards/format_reward_beginEndSolutionFormat": 2.0,
369
+ "rewards/length_reward": 0.9000000031664968,
370
+ "step": 48
371
+ },
372
+ {
373
+ "completion_length": 938.2875137329102,
374
+ "epoch": 0.034065746891500596,
375
+ "grad_norm": 0.6014713048934937,
376
+ "kl": 0.126922607421875,
377
+ "learning_rate": 1.9096319953545186e-05,
378
+ "loss": 0.0051,
379
+ "reward": 3.5386421233415604,
380
+ "reward_std": 1.796545386314392,
381
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6011420339345932,
382
+ "rewards/all-MiniLM-L6-v2": 0.4000000022351742,
383
+ "rewards/format_reward_beginEndSolutionFormat": 1.8124999925494194,
384
+ "rewards/length_reward": 0.7250000089406967,
385
+ "step": 50
386
+ },
387
+ {
388
+ "completion_length": 936.8000183105469,
389
+ "epoch": 0.03542837676716062,
390
+ "grad_norm": 0.623587429523468,
391
+ "kl": 0.138763427734375,
392
+ "learning_rate": 1.895993774291336e-05,
393
+ "loss": 0.0056,
394
+ "reward": 3.6302099227905273,
395
+ "reward_std": 1.3007243419997394,
396
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6302098724991083,
397
+ "rewards/all-MiniLM-L6-v2": 0.3375000022351742,
398
+ "rewards/format_reward_beginEndSolutionFormat": 1.9624999985098839,
399
+ "rewards/length_reward": 0.7000000076368451,
400
+ "step": 52
401
+ },
402
+ {
403
+ "completion_length": 774.3875160217285,
404
+ "epoch": 0.036791006642820645,
405
+ "grad_norm": 0.5999850630760193,
406
+ "kl": 0.15771484375,
407
+ "learning_rate": 1.881453363447582e-05,
408
+ "loss": 0.0063,
409
+ "reward": 4.047312401235104,
410
+ "reward_std": 0.8825826905667782,
411
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6848122831434011,
412
+ "rewards/all-MiniLM-L6-v2": 0.4125000024214387,
413
+ "rewards/format_reward_beginEndSolutionFormat": 2.0,
414
+ "rewards/length_reward": 0.9500000029802322,
415
+ "step": 54
416
+ },
417
+ {
418
+ "completion_length": 664.0375175476074,
419
+ "epoch": 0.038153636518480666,
420
+ "grad_norm": 0.6805257201194763,
421
+ "kl": 0.1588134765625,
422
+ "learning_rate": 1.866025403784439e-05,
423
+ "loss": 0.0064,
424
+ "reward": 3.92198596149683,
425
+ "reward_std": 0.7282474122475833,
426
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6469858698546886,
427
+ "rewards/all-MiniLM-L6-v2": 0.34999999962747097,
428
+ "rewards/format_reward_beginEndSolutionFormat": 2.0,
429
+ "rewards/length_reward": 0.9250000016763806,
430
+ "step": 56
431
+ },
432
+ {
433
+ "completion_length": 702.6375160217285,
434
+ "epoch": 0.039516266394140694,
435
+ "grad_norm": 0.7188987135887146,
436
+ "kl": 0.14849853515625,
437
+ "learning_rate": 1.8497254299495147e-05,
438
+ "loss": 0.0059,
439
+ "reward": 3.4211702197790146,
440
+ "reward_std": 1.4494188777171075,
441
+ "rewards/AnswerChecker_LenCheck_Reward": 0.5586702147556935,
442
+ "rewards/all-MiniLM-L6-v2": 0.2875000052154064,
443
+ "rewards/format_reward_beginEndSolutionFormat": 1.8499999977648258,
444
+ "rewards/length_reward": 0.725000006146729,
445
+ "step": 58
446
+ },
447
+ {
448
+ "completion_length": 641.7625064849854,
449
+ "epoch": 0.040878896269800714,
450
+ "grad_norm": 0.9523748159408569,
451
+ "kl": 0.1611328125,
452
+ "learning_rate": 1.8325698546347714e-05,
453
+ "loss": 0.0064,
454
+ "reward": 3.841396249830723,
455
+ "reward_std": 1.1254777936264873,
456
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6538961809128523,
457
+ "rewards/all-MiniLM-L6-v2": 0.43750000186264515,
458
+ "rewards/format_reward_beginEndSolutionFormat": 2.0,
459
+ "rewards/length_reward": 0.7500000093132257,
460
+ "step": 60
461
+ },
462
+ {
463
+ "completion_length": 704.0375175476074,
464
+ "epoch": 0.04224152614546074,
465
+ "grad_norm": 0.7048397064208984,
466
+ "kl": 0.164093017578125,
467
+ "learning_rate": 1.814575952050336e-05,
468
+ "loss": 0.0066,
469
+ "reward": 3.5937935784459114,
470
+ "reward_std": 1.1683010547421873,
471
+ "rewards/AnswerChecker_LenCheck_Reward": 0.71879349835217,
472
+ "rewards/all-MiniLM-L6-v2": 0.13750000018626451,
473
+ "rewards/format_reward_beginEndSolutionFormat": 1.9624999985098839,
474
+ "rewards/length_reward": 0.7750000059604645,
475
+ "step": 62
476
+ },
477
+ {
478
+ "completion_length": 609.1125087738037,
479
+ "epoch": 0.04360415602112076,
480
+ "grad_norm": 0.8203504085540771,
481
+ "kl": 0.178314208984375,
482
+ "learning_rate": 1.7957618405308323e-05,
483
+ "loss": 0.0071,
484
+ "reward": 3.6722382232546806,
485
+ "reward_std": 1.1893206317909062,
486
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6222381666302681,
487
+ "rewards/all-MiniLM-L6-v2": 0.3000000035390258,
488
+ "rewards/format_reward_beginEndSolutionFormat": 2.0,
489
+ "rewards/length_reward": 0.7500000093132257,
490
+ "step": 64
491
+ },
492
+ {
493
+ "completion_length": 640.1500129699707,
494
+ "epoch": 0.044966785896780784,
495
+ "grad_norm": 0.7613713145256042,
496
+ "kl": 0.1834259033203125,
497
+ "learning_rate": 1.776146464291757e-05,
498
+ "loss": 0.0073,
499
+ "reward": 4.024308562278748,
500
+ "reward_std": 1.0513806878589094,
501
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6868084445595741,
502
+ "rewards/all-MiniLM-L6-v2": 0.3624999988824129,
503
+ "rewards/format_reward_beginEndSolutionFormat": 2.0,
504
+ "rewards/length_reward": 0.9750000014901161,
505
+ "step": 66
506
+ },
507
+ {
508
+ "completion_length": 793.487512588501,
509
+ "epoch": 0.04632941577244081,
510
+ "grad_norm": 0.7231267094612122,
511
+ "kl": 0.26812744140625,
512
+ "learning_rate": 1.7557495743542586e-05,
513
+ "loss": 0.0107,
514
+ "reward": 2.855987273156643,
515
+ "reward_std": 1.688657500082627,
516
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6434871796518564,
517
+ "rewards/all-MiniLM-L6-v2": -0.012499998323619366,
518
+ "rewards/format_reward_beginEndSolutionFormat": 1.8499999986961484,
519
+ "rewards/length_reward": 0.3750000046566129,
520
+ "step": 68
521
+ },
522
+ {
523
+ "completion_length": 789.1500263214111,
524
+ "epoch": 0.04769204564810083,
525
+ "grad_norm": 0.9249826669692993,
526
+ "kl": 0.33258056640625,
527
+ "learning_rate": 1.734591708657533e-05,
528
+ "loss": 0.0133,
529
+ "reward": 2.242995586246252,
530
+ "reward_std": 2.346400245092809,
531
+ "rewards/AnswerChecker_LenCheck_Reward": 0.5554955210536718,
532
+ "rewards/all-MiniLM-L6-v2": -0.19999999832361937,
533
+ "rewards/format_reward_beginEndSolutionFormat": 1.6624999940395355,
534
+ "rewards/length_reward": 0.22500000149011612,
535
+ "step": 70
536
+ },
537
+ {
538
+ "completion_length": 702.3250160217285,
539
+ "epoch": 0.04905467552376086,
540
+ "grad_norm": 1.8796899318695068,
541
+ "kl": 0.3321533203125,
542
+ "learning_rate": 1.7126941713788633e-05,
543
+ "loss": 0.0133,
544
+ "reward": 3.2902688309550285,
545
+ "reward_std": 1.93663672497496,
546
+ "rewards/AnswerChecker_LenCheck_Reward": 0.7652687635272741,
547
+ "rewards/all-MiniLM-L6-v2": 0.17500000167638063,
548
+ "rewards/format_reward_beginEndSolutionFormat": 1.8500000014901161,
549
+ "rewards/length_reward": 0.500000006519258,
550
+ "step": 72
551
+ },
552
+ {
553
+ "completion_length": 796.2000179290771,
554
+ "epoch": 0.05041730539942088,
555
+ "grad_norm": 1.0197867155075073,
556
+ "kl": 0.45782470703125,
557
+ "learning_rate": 1.6900790114821122e-05,
558
+ "loss": 0.0183,
559
+ "reward": 2.451604187488556,
560
+ "reward_std": 2.372755976160988,
561
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6391041371971369,
562
+ "rewards/all-MiniLM-L6-v2": 0.10000000335276127,
563
+ "rewards/format_reward_beginEndSolutionFormat": 1.3624999970197678,
564
+ "rewards/length_reward": 0.35000001080334187,
565
+ "step": 74
566
+ },
567
+ {
568
+ "completion_length": 716.0625038146973,
569
+ "epoch": 0.05177993527508091,
570
+ "grad_norm": 1.0521018505096436,
571
+ "kl": 0.38922119140625,
572
+ "learning_rate": 1.666769000516292e-05,
573
+ "loss": 0.0156,
574
+ "reward": 3.2886771922931075,
575
+ "reward_std": 2.0216886228881776,
576
+ "rewards/AnswerChecker_LenCheck_Reward": 0.7261771578341722,
577
+ "rewards/all-MiniLM-L6-v2": 0.17500000074505806,
578
+ "rewards/format_reward_beginEndSolutionFormat": 1.737500000745058,
579
+ "rewards/length_reward": 0.6500000106170774,
580
+ "step": 76
581
+ },
582
+ {
583
+ "completion_length": 835.2875137329102,
584
+ "epoch": 0.05314256515074093,
585
+ "grad_norm": 1.5486301183700562,
586
+ "kl": 0.5540771484375,
587
+ "learning_rate": 1.6427876096865394e-05,
588
+ "loss": 0.0222,
589
+ "reward": 2.288073843345046,
590
+ "reward_std": 2.585562598425895,
591
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6880737878382206,
592
+ "rewards/all-MiniLM-L6-v2": -0.16250000149011612,
593
+ "rewards/format_reward_beginEndSolutionFormat": 1.2875000014901161,
594
+ "rewards/length_reward": 0.47500000335276127,
595
+ "step": 78
596
+ },
597
+ {
598
+ "completion_length": 842.4875106811523,
599
+ "epoch": 0.05450519502640096,
600
+ "grad_norm": 0.7873005867004395,
601
+ "kl": 0.5556640625,
602
+ "learning_rate": 1.6181589862206053e-05,
603
+ "loss": 0.0222,
604
+ "reward": 2.895622299052775,
605
+ "reward_std": 2.1268671478610486,
606
+ "rewards/AnswerChecker_LenCheck_Reward": 0.570622275583446,
607
+ "rewards/all-MiniLM-L6-v2": 0.10000000428408384,
608
+ "rewards/format_reward_beginEndSolutionFormat": 1.7749999910593033,
609
+ "rewards/length_reward": 0.45000000298023224,
610
+ "step": 80
611
+ },
612
+ {
613
+ "completion_length": 866.9500141143799,
614
+ "epoch": 0.05586782490206098,
615
+ "grad_norm": 0.8428084254264832,
616
+ "kl": 0.599456787109375,
617
+ "learning_rate": 1.5929079290546408e-05,
618
+ "loss": 0.024,
619
+ "reward": 2.1291475765174255,
620
+ "reward_std": 2.3366958047263324,
621
+ "rewards/AnswerChecker_LenCheck_Reward": 0.5416475932579488,
622
+ "rewards/all-MiniLM-L6-v2": -0.3124999953433871,
623
+ "rewards/format_reward_beginEndSolutionFormat": 1.6249999925494194,
624
+ "rewards/length_reward": 0.27500000316649675,
625
+ "step": 82
626
+ },
627
+ {
628
+ "completion_length": 709.7125110626221,
629
+ "epoch": 0.057230454777721,
630
+ "grad_norm": 0.714391827583313,
631
+ "kl": 0.534912109375,
632
+ "learning_rate": 1.5670598638627707e-05,
633
+ "loss": 0.0214,
634
+ "reward": 2.692725880071521,
635
+ "reward_std": 2.575832322239876,
636
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6302258470095694,
637
+ "rewards/all-MiniLM-L6-v2": 0.10000000428408384,
638
+ "rewards/format_reward_beginEndSolutionFormat": 1.662499994970858,
639
+ "rewards/length_reward": 0.30000000447034836,
640
+ "step": 84
641
+ },
642
+ {
643
+ "completion_length": 653.9125118255615,
644
+ "epoch": 0.058593084653381027,
645
+ "grad_norm": 2.24059796333313,
646
+ "kl": 0.57843017578125,
647
+ "learning_rate": 1.5406408174555978e-05,
648
+ "loss": 0.0232,
649
+ "reward": 2.6488085128366947,
650
+ "reward_std": 2.0597222729120404,
651
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6988084595650434,
652
+ "rewards/all-MiniLM-L6-v2": -0.050000001676380634,
653
+ "rewards/format_reward_beginEndSolutionFormat": 1.6999999955296516,
654
+ "rewards/length_reward": 0.30000000540167093,
655
+ "step": 86
656
+ },
657
+ {
658
+ "completion_length": 530.4875144958496,
659
+ "epoch": 0.05995571452904105,
660
+ "grad_norm": 0.9869608283042908,
661
+ "kl": 0.4842529296875,
662
+ "learning_rate": 1.5136773915734067e-05,
663
+ "loss": 0.0194,
664
+ "reward": 3.0965824760496616,
665
+ "reward_std": 1.8650478832423687,
666
+ "rewards/AnswerChecker_LenCheck_Reward": 0.7215823549777269,
667
+ "rewards/all-MiniLM-L6-v2": 0.3625000035390258,
668
+ "rewards/format_reward_beginEndSolutionFormat": 1.8124999925494194,
669
+ "rewards/length_reward": 0.2000000076368451,
670
+ "step": 88
671
+ },
672
+ {
673
+ "completion_length": 572.0000076293945,
674
+ "epoch": 0.061318344404701075,
675
+ "grad_norm": 0.7383394241333008,
676
+ "kl": 0.4207763671875,
677
+ "learning_rate": 1.4861967361004687e-05,
678
+ "loss": 0.0168,
679
+ "reward": 2.8489164412021637,
680
+ "reward_std": 2.041708583943546,
681
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6489164028316736,
682
+ "rewards/all-MiniLM-L6-v2": -0.04999999701976776,
683
+ "rewards/format_reward_beginEndSolutionFormat": 1.7749999947845936,
684
+ "rewards/length_reward": 0.4750000089406967,
685
+ "step": 90
686
+ },
687
+ {
688
+ "completion_length": 643.6000156402588,
689
+ "epoch": 0.0626809742803611,
690
+ "grad_norm": 0.7305729389190674,
691
+ "kl": 0.417236328125,
692
+ "learning_rate": 1.4582265217274105e-05,
693
+ "loss": 0.0167,
694
+ "reward": 3.1850010380148888,
695
+ "reward_std": 1.72700658114627,
696
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6975009143352509,
697
+ "rewards/all-MiniLM-L6-v2": 0.22499999962747097,
698
+ "rewards/format_reward_beginEndSolutionFormat": 1.6624999977648258,
699
+ "rewards/length_reward": 0.6000000089406967,
700
+ "step": 92
701
+ },
702
+ {
703
+ "completion_length": 843.2000122070312,
704
+ "epoch": 0.06404360415602112,
705
+ "grad_norm": 0.5814893841743469,
706
+ "kl": 0.44672393798828125,
707
+ "learning_rate": 1.4297949120891718e-05,
708
+ "loss": 0.0179,
709
+ "reward": 2.340944491326809,
710
+ "reward_std": 2.093850734643638,
711
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6159445056691766,
712
+ "rewards/all-MiniLM-L6-v2": -0.31249999813735485,
713
+ "rewards/format_reward_beginEndSolutionFormat": 1.5124999964609742,
714
+ "rewards/length_reward": 0.5250000022351742,
715
+ "step": 94
716
+ },
717
+ {
718
+ "completion_length": 767.9875068664551,
719
+ "epoch": 0.06540623403168115,
720
+ "grad_norm": 0.5642020106315613,
721
+ "kl": 0.41064453125,
722
+ "learning_rate": 1.4009305354066138e-05,
723
+ "loss": 0.0164,
724
+ "reward": 3.0814263820648193,
725
+ "reward_std": 2.0845829099416733,
726
+ "rewards/AnswerChecker_LenCheck_Reward": 0.7064263448119164,
727
+ "rewards/all-MiniLM-L6-v2": 0.07499999925494194,
728
+ "rewards/format_reward_beginEndSolutionFormat": 1.625,
729
+ "rewards/length_reward": 0.6750000054016709,
730
+ "step": 96
731
+ },
732
+ {
733
+ "completion_length": 732.2500190734863,
734
+ "epoch": 0.06676886390734117,
735
+ "grad_norm": 0.6293222904205322,
736
+ "kl": 0.4149169921875,
737
+ "learning_rate": 1.3716624556603275e-05,
738
+ "loss": 0.0166,
739
+ "reward": 3.167923970380798,
740
+ "reward_std": 1.9076070357114077,
741
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6679238956421614,
742
+ "rewards/all-MiniLM-L6-v2": 0.12500000093132257,
743
+ "rewards/format_reward_beginEndSolutionFormat": 1.6999999955296516,
744
+ "rewards/length_reward": 0.6750000091269612,
745
+ "step": 98
746
+ },
747
+ {
748
+ "completion_length": 770.9750061035156,
749
+ "epoch": 0.06813149378300119,
750
+ "grad_norm": 0.608492910861969,
751
+ "kl": 0.38108062744140625,
752
+ "learning_rate": 1.342020143325669e-05,
753
+ "loss": 0.0153,
754
+ "reward": 2.9613163992762566,
755
+ "reward_std": 1.8579347440972924,
756
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6363163664937019,
757
+ "rewards/all-MiniLM-L6-v2": -0.049999997951090336,
758
+ "rewards/format_reward_beginEndSolutionFormat": 1.8499999977648258,
759
+ "rewards/length_reward": 0.5250000059604645,
760
+ "step": 100
761
+ },
762
+ {
763
+ "epoch": 0.06813149378300119,
764
+ "eval_completion_length": 719.5635530948639,
765
+ "eval_kl": 0.42165565490722656,
766
+ "eval_loss": 0.01686381734907627,
767
+ "eval_reward": 3.1516319193760864,
768
+ "eval_reward_std": 1.9167220215313137,
769
+ "eval_rewards/AnswerChecker_LenCheck_Reward": 0.6703818460227922,
770
+ "eval_rewards/all-MiniLM-L6-v2": 0.06250000081490725,
771
+ "eval_rewards/format_reward_beginEndSolutionFormat": 1.7374999963212758,
772
+ "eval_rewards/length_reward": 0.6812500089872628,
773
+ "eval_runtime": 1234.6422,
774
+ "eval_samples_per_second": 0.104,
775
+ "eval_steps_per_second": 0.021,
776
+ "step": 100
777
+ },
778
+ {
779
+ "completion_length": 708.6000080108643,
780
+ "epoch": 0.06949412365866121,
781
+ "grad_norm": 0.5848133563995361,
782
+ "kl": 0.41204833984375,
783
+ "learning_rate": 1.3120334456984871e-05,
784
+ "loss": 0.0165,
785
+ "reward": 3.301197201013565,
786
+ "reward_std": 2.1839559585787356,
787
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6011971952393651,
788
+ "rewards/all-MiniLM-L6-v2": 0.24999999906867743,
789
+ "rewards/format_reward_beginEndSolutionFormat": 1.699999988079071,
790
+ "rewards/length_reward": 0.7500000093132257,
791
+ "step": 102
792
+ },
793
+ {
794
+ "completion_length": 660.3000183105469,
795
+ "epoch": 0.07085675353432123,
796
+ "grad_norm": 1.3479467630386353,
797
+ "kl": 0.50341796875,
798
+ "learning_rate": 1.2817325568414299e-05,
799
+ "loss": 0.0201,
800
+ "reward": 2.9223585426807404,
801
+ "reward_std": 1.763994950801134,
802
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6473584100604057,
803
+ "rewards/all-MiniLM-L6-v2": -0.17499999701976776,
804
+ "rewards/format_reward_beginEndSolutionFormat": 1.7749999985098839,
805
+ "rewards/length_reward": 0.6750000081956387,
806
+ "step": 104
807
+ },
808
+ {
809
+ "completion_length": 684.8875122070312,
810
+ "epoch": 0.07221938340998127,
811
+ "grad_norm": 0.6432397365570068,
812
+ "kl": 0.3974609375,
813
+ "learning_rate": 1.2511479871810792e-05,
814
+ "loss": 0.0159,
815
+ "reward": 3.2463276791386306,
816
+ "reward_std": 1.709463362582028,
817
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6588275767862797,
818
+ "rewards/all-MiniLM-L6-v2": 0.02500000037252903,
819
+ "rewards/format_reward_beginEndSolutionFormat": 1.8124999925494194,
820
+ "rewards/length_reward": 0.7500000046566129,
821
+ "step": 106
822
+ },
823
+ {
824
+ "completion_length": 671.6875095367432,
825
+ "epoch": 0.07358201328564129,
826
+ "grad_norm": 0.6950499415397644,
827
+ "kl": 0.41790771484375,
828
+ "learning_rate": 1.2203105327865407e-05,
829
+ "loss": 0.0167,
830
+ "reward": 3.2045797668397427,
831
+ "reward_std": 1.9101572453510016,
832
+ "rewards/AnswerChecker_LenCheck_Reward": 0.7170797474682331,
833
+ "rewards/all-MiniLM-L6-v2": 0.14999999944120646,
834
+ "rewards/format_reward_beginEndSolutionFormat": 1.6624999986961484,
835
+ "rewards/length_reward": 0.6750000072643161,
836
+ "step": 108
837
+ },
838
+ {
839
+ "completion_length": 808.1000099182129,
840
+ "epoch": 0.07494464316130131,
841
+ "grad_norm": 0.5721182227134705,
842
+ "kl": 0.511962890625,
843
+ "learning_rate": 1.1892512443604103e-05,
844
+ "loss": 0.0205,
845
+ "reward": 2.2793450676836073,
846
+ "reward_std": 2.890487961471081,
847
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6168450396507978,
848
+ "rewards/all-MiniLM-L6-v2": -0.3124999972060323,
849
+ "rewards/format_reward_beginEndSolutionFormat": 1.5499999970197678,
850
+ "rewards/length_reward": 0.4250000100582838,
851
+ "step": 110
852
+ },
853
+ {
854
+ "completion_length": 693.9000129699707,
855
+ "epoch": 0.07630727303696133,
856
+ "grad_norm": 0.5843067765235901,
857
+ "kl": 0.41845703125,
858
+ "learning_rate": 1.15800139597335e-05,
859
+ "loss": 0.0167,
860
+ "reward": 3.066192142665386,
861
+ "reward_std": 2.348536633886397,
862
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6661920826882124,
863
+ "rewards/all-MiniLM-L6-v2": -0.01249999925494194,
864
+ "rewards/format_reward_beginEndSolutionFormat": 1.7374999895691872,
865
+ "rewards/length_reward": 0.675000011920929,
866
+ "step": 112
867
+ },
868
+ {
869
+ "completion_length": 868.7250137329102,
870
+ "epoch": 0.07766990291262135,
871
+ "grad_norm": 0.5749533772468567,
872
+ "kl": 0.63037109375,
873
+ "learning_rate": 1.1265924535737494e-05,
874
+ "loss": 0.0252,
875
+ "reward": 1.36430923640728,
876
+ "reward_std": 3.1500269323587418,
877
+ "rewards/AnswerChecker_LenCheck_Reward": 0.514309162274003,
878
+ "rewards/all-MiniLM-L6-v2": -0.6499999975785613,
879
+ "rewards/format_reward_beginEndSolutionFormat": 1.0999999940395355,
880
+ "rewards/length_reward": 0.4000000050291419,
881
+ "step": 114
882
+ },
883
+ {
884
+ "completion_length": 725.7375068664551,
885
+ "epoch": 0.07903253278828139,
886
+ "grad_norm": 0.5346440672874451,
887
+ "kl": 0.4619140625,
888
+ "learning_rate": 1.0950560433041825e-05,
889
+ "loss": 0.0185,
890
+ "reward": 2.963149979710579,
891
+ "reward_std": 2.0242148893885314,
892
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6506498479284346,
893
+ "rewards/all-MiniLM-L6-v2": -0.16249999776482582,
894
+ "rewards/format_reward_beginEndSolutionFormat": 1.8499999940395355,
895
+ "rewards/length_reward": 0.6250000093132257,
896
+ "step": 116
897
+ },
898
+ {
899
+ "completion_length": 773.3750152587891,
900
+ "epoch": 0.08039516266394141,
901
+ "grad_norm": 0.6365153193473816,
902
+ "kl": 0.4718017578125,
903
+ "learning_rate": 1.0634239196565646e-05,
904
+ "loss": 0.0189,
905
+ "reward": 2.2029636511579156,
906
+ "reward_std": 2.7039426697883755,
907
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6029636058956385,
908
+ "rewards/all-MiniLM-L6-v2": -0.3499999986961484,
909
+ "rewards/format_reward_beginEndSolutionFormat": 1.474999993108213,
910
+ "rewards/length_reward": 0.4750000098720193,
911
+ "step": 118
912
+ },
913
+ {
914
+ "completion_length": 775.962516784668,
915
+ "epoch": 0.08175779253960143,
916
+ "grad_norm": 0.5627033710479736,
917
+ "kl": 0.5267333984375,
918
+ "learning_rate": 1.031727933498068e-05,
919
+ "loss": 0.0211,
920
+ "reward": 2.5474718529731035,
921
+ "reward_std": 2.10833885287866,
922
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6349717965349555,
923
+ "rewards/all-MiniLM-L6-v2": -0.27499999944120646,
924
+ "rewards/format_reward_beginEndSolutionFormat": 1.6624999940395355,
925
+ "rewards/length_reward": 0.5250000078231096,
926
+ "step": 120
927
+ },
928
+ {
929
+ "completion_length": 641.2125091552734,
930
+ "epoch": 0.08312042241526145,
931
+ "grad_norm": 0.5676293969154358,
932
+ "kl": 0.413330078125,
933
+ "learning_rate": 1e-05,
934
+ "loss": 0.0165,
935
+ "reward": 3.1914120875298977,
936
+ "reward_std": 1.7586256829090416,
937
+ "rewards/AnswerChecker_LenCheck_Reward": 0.7289120629429817,
938
+ "rewards/all-MiniLM-L6-v2": -0.11249999981373549,
939
+ "rewards/format_reward_beginEndSolutionFormat": 1.8499999940395355,
940
+ "rewards/length_reward": 0.7250000108033419,
941
+ "step": 122
942
+ },
943
+ {
944
+ "completion_length": 708.8875179290771,
945
+ "epoch": 0.08448305229092148,
946
+ "grad_norm": 0.6845495700836182,
947
+ "kl": 0.5447998046875,
948
+ "learning_rate": 9.682720665019325e-06,
949
+ "loss": 0.0218,
950
+ "reward": 2.7032833639532328,
951
+ "reward_std": 2.0250429082661867,
952
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6782832983881235,
953
+ "rewards/all-MiniLM-L6-v2": -0.125,
954
+ "rewards/format_reward_beginEndSolutionFormat": 1.6249999962747097,
955
+ "rewards/length_reward": 0.5250000078231096,
956
+ "step": 124
957
+ },
958
+ {
959
+ "completion_length": 693.4625110626221,
960
+ "epoch": 0.0858456821665815,
961
+ "grad_norm": 0.7195069193840027,
962
+ "kl": 0.4825439453125,
963
+ "learning_rate": 9.365760803434356e-06,
964
+ "loss": 0.0193,
965
+ "reward": 2.8621206106618047,
966
+ "reward_std": 2.122805818915367,
967
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6871205642819405,
968
+ "rewards/all-MiniLM-L6-v2": -0.19999999832361937,
969
+ "rewards/format_reward_beginEndSolutionFormat": 1.7749999947845936,
970
+ "rewards/length_reward": 0.6000000080093741,
971
+ "step": 126
972
+ },
973
+ {
974
+ "completion_length": 716.3250122070312,
975
+ "epoch": 0.08720831204224153,
976
+ "grad_norm": 0.678680419921875,
977
+ "kl": 0.4881591796875,
978
+ "learning_rate": 9.049439566958176e-06,
979
+ "loss": 0.0195,
980
+ "reward": 3.0072994977235794,
981
+ "reward_std": 2.460042350925505,
982
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6822993978857994,
983
+ "rewards/all-MiniLM-L6-v2": -0.012499997392296791,
984
+ "rewards/format_reward_beginEndSolutionFormat": 1.6624999903142452,
985
+ "rewards/length_reward": 0.6750000109896064,
986
+ "step": 128
987
+ },
988
+ {
989
+ "completion_length": 656.1500129699707,
990
+ "epoch": 0.08857094191790155,
991
+ "grad_norm": 0.49265533685684204,
992
+ "kl": 0.39990234375,
993
+ "learning_rate": 8.734075464262507e-06,
994
+ "loss": 0.016,
995
+ "reward": 3.324027754366398,
996
+ "reward_std": 1.9725347934290767,
997
+ "rewards/AnswerChecker_LenCheck_Reward": 0.7615277506411076,
998
+ "rewards/all-MiniLM-L6-v2": 1.862645149230957e-09,
999
+ "rewards/format_reward_beginEndSolutionFormat": 1.7374999932944775,
1000
+ "rewards/length_reward": 0.8250000104308128,
1001
+ "step": 130
1002
+ },
1003
+ {
1004
+ "completion_length": 780.5625152587891,
1005
+ "epoch": 0.08993357179356157,
1006
+ "grad_norm": 0.5680932402610779,
1007
+ "kl": 0.452880859375,
1008
+ "learning_rate": 8.419986040266502e-06,
1009
+ "loss": 0.0181,
1010
+ "reward": 3.1687414944171906,
1011
+ "reward_std": 2.452119631692767,
1012
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6687414180487394,
1013
+ "rewards/all-MiniLM-L6-v2": 0.25,
1014
+ "rewards/format_reward_beginEndSolutionFormat": 1.624999988824129,
1015
+ "rewards/length_reward": 0.625000006519258,
1016
+ "step": 132
1017
+ },
1018
+ {
1019
+ "completion_length": 787.3375110626221,
1020
+ "epoch": 0.0912962016692216,
1021
+ "grad_norm": 0.5462779998779297,
1022
+ "kl": 0.43536376953125,
1023
+ "learning_rate": 8.107487556395902e-06,
1024
+ "loss": 0.0174,
1025
+ "reward": 2.4839657694101334,
1026
+ "reward_std": 2.853791818022728,
1027
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6589657235890627,
1028
+ "rewards/all-MiniLM-L6-v2": -0.2625000011175871,
1029
+ "rewards/format_reward_beginEndSolutionFormat": 1.5124999964609742,
1030
+ "rewards/length_reward": 0.5750000094994903,
1031
+ "step": 134
1032
+ },
1033
+ {
1034
+ "completion_length": 820.3000240325928,
1035
+ "epoch": 0.09265883154488162,
1036
+ "grad_norm": 0.5613769292831421,
1037
+ "kl": 0.524169921875,
1038
+ "learning_rate": 7.796894672134594e-06,
1039
+ "loss": 0.021,
1040
+ "reward": 1.7514715013094246,
1041
+ "reward_std": 3.1442014798521996,
1042
+ "rewards/AnswerChecker_LenCheck_Reward": 0.613971471786499,
1043
+ "rewards/all-MiniLM-L6-v2": -0.46250000037252903,
1044
+ "rewards/format_reward_beginEndSolutionFormat": 1.1749999951571226,
1045
+ "rewards/length_reward": 0.42500001192092896,
1046
+ "step": 136
1047
+ },
1048
+ {
1049
+ "completion_length": 772.2000160217285,
1050
+ "epoch": 0.09402146142054164,
1051
+ "grad_norm": 0.5531837940216064,
1052
+ "kl": 0.4697265625,
1053
+ "learning_rate": 7.488520128189209e-06,
1054
+ "loss": 0.0188,
1055
+ "reward": 2.512004946358502,
1056
+ "reward_std": 2.8804666833020747,
1057
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6370048951357603,
1058
+ "rewards/all-MiniLM-L6-v2": -0.16249999962747097,
1059
+ "rewards/format_reward_beginEndSolutionFormat": 1.4374999897554517,
1060
+ "rewards/length_reward": 0.6000000163912773,
1061
+ "step": 138
1062
+ },
1063
+ {
1064
+ "completion_length": 798.4750156402588,
1065
+ "epoch": 0.09538409129620166,
1066
+ "grad_norm": 0.5136257410049438,
1067
+ "kl": 0.46234130859375,
1068
+ "learning_rate": 7.182674431585703e-06,
1069
+ "loss": 0.0185,
1070
+ "reward": 2.3262645918875933,
1071
+ "reward_std": 3.08097755163908,
1072
+ "rewards/AnswerChecker_LenCheck_Reward": 0.5512645365670323,
1073
+ "rewards/all-MiniLM-L6-v2": -0.16250000055879354,
1074
+ "rewards/format_reward_beginEndSolutionFormat": 1.4374999944120646,
1075
+ "rewards/length_reward": 0.5000000121071935,
1076
+ "step": 140
1077
+ },
1078
+ {
1079
+ "completion_length": 826.4000129699707,
1080
+ "epoch": 0.0967467211718617,
1081
+ "grad_norm": 0.5762442946434021,
1082
+ "kl": 0.52813720703125,
1083
+ "learning_rate": 6.87966554301513e-06,
1084
+ "loss": 0.0211,
1085
+ "reward": 1.824558557709679,
1086
+ "reward_std": 2.8447077772580087,
1087
+ "rewards/AnswerChecker_LenCheck_Reward": 0.5995585434138775,
1088
+ "rewards/all-MiniLM-L6-v2": -0.36249999422580004,
1089
+ "rewards/format_reward_beginEndSolutionFormat": 1.2124999975785613,
1090
+ "rewards/length_reward": 0.3750000046566129,
1091
+ "step": 142
1092
+ },
1093
+ {
1094
+ "completion_length": 750.8875217437744,
1095
+ "epoch": 0.09810935104752172,
1096
+ "grad_norm": 0.6162233948707581,
1097
+ "kl": 0.4547119140625,
1098
+ "learning_rate": 6.579798566743314e-06,
1099
+ "loss": 0.0182,
1100
+ "reward": 2.420007437467575,
1101
+ "reward_std": 2.7739858217537403,
1102
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6075074281543493,
1103
+ "rewards/all-MiniLM-L6-v2": -0.19999999832361937,
1104
+ "rewards/format_reward_beginEndSolutionFormat": 1.437499993480742,
1105
+ "rewards/length_reward": 0.575000012293458,
1106
+ "step": 144
1107
+ },
1108
+ {
1109
+ "completion_length": 734.2625026702881,
1110
+ "epoch": 0.09947198092318174,
1111
+ "grad_norm": 0.5969240069389343,
1112
+ "kl": 0.5074462890625,
1113
+ "learning_rate": 6.283375443396726e-06,
1114
+ "loss": 0.0203,
1115
+ "reward": 2.7079888563603163,
1116
+ "reward_std": 2.7715829075314105,
1117
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6204887935891747,
1118
+ "rewards/all-MiniLM-L6-v2": 0.11249999888241291,
1119
+ "rewards/format_reward_beginEndSolutionFormat": 1.3999999957159162,
1120
+ "rewards/length_reward": 0.5750000076368451,
1121
+ "step": 146
1122
+ },
1123
+ {
1124
+ "completion_length": 684.1750106811523,
1125
+ "epoch": 0.10083461079884176,
1126
+ "grad_norm": 0.6124630570411682,
1127
+ "kl": 0.4541015625,
1128
+ "learning_rate": 5.990694645933866e-06,
1129
+ "loss": 0.0182,
1130
+ "reward": 2.8833600133657455,
1131
+ "reward_std": 2.8265584837645292,
1132
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6833599451929331,
1133
+ "rewards/all-MiniLM-L6-v2": 0.21249999944120646,
1134
+ "rewards/format_reward_beginEndSolutionFormat": 1.437499993480742,
1135
+ "rewards/length_reward": 0.550000011920929,
1136
+ "step": 148
1137
+ },
1138
+ {
1139
+ "completion_length": 623.0000114440918,
1140
+ "epoch": 0.10219724067450178,
1141
+ "grad_norm": 0.6724729537963867,
1142
+ "kl": 0.4202880859375,
1143
+ "learning_rate": 5.702050879108284e-06,
1144
+ "loss": 0.0168,
1145
+ "reward": 3.6663911901414394,
1146
+ "reward_std": 1.9338009762577713,
1147
+ "rewards/AnswerChecker_LenCheck_Reward": 0.7163911033421755,
1148
+ "rewards/all-MiniLM-L6-v2": 0.4750000014901161,
1149
+ "rewards/format_reward_beginEndSolutionFormat": 1.699999992735684,
1150
+ "rewards/length_reward": 0.7750000106170774,
1151
+ "step": 150
1152
+ },
1153
+ {
1154
+ "completion_length": 674.3875122070312,
1155
+ "epoch": 0.10355987055016182,
1156
+ "grad_norm": 0.5944858193397522,
1157
+ "kl": 0.4415283203125,
1158
+ "learning_rate": 5.417734782725896e-06,
1159
+ "loss": 0.0177,
1160
+ "reward": 3.1010947674512863,
1161
+ "reward_std": 2.580965321045369,
1162
+ "rewards/AnswerChecker_LenCheck_Reward": 0.7135947253555059,
1163
+ "rewards/all-MiniLM-L6-v2": 0.28749999962747097,
1164
+ "rewards/format_reward_beginEndSolutionFormat": 1.5499999932944775,
1165
+ "rewards/length_reward": 0.5500000081956387,
1166
+ "step": 152
1167
+ },
1168
+ {
1169
+ "completion_length": 717.3375110626221,
1170
+ "epoch": 0.10492250042582184,
1171
+ "grad_norm": 0.6234627366065979,
1172
+ "kl": 0.489990234375,
1173
+ "learning_rate": 5.138032638995315e-06,
1174
+ "loss": 0.0196,
1175
+ "reward": 2.735639053862542,
1176
+ "reward_std": 2.6947292862460017,
1177
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6606389936059713,
1178
+ "rewards/all-MiniLM-L6-v2": 0.062499999068677425,
1179
+ "rewards/format_reward_beginEndSolutionFormat": 1.5874999957159162,
1180
+ "rewards/length_reward": 0.4250000072643161,
1181
+ "step": 154
1182
+ },
1183
+ {
1184
+ "completion_length": 629.3750095367432,
1185
+ "epoch": 0.10628513030148186,
1186
+ "grad_norm": 0.6413260102272034,
1187
+ "kl": 0.4696044921875,
1188
+ "learning_rate": 4.863226084265939e-06,
1189
+ "loss": 0.0188,
1190
+ "reward": 3.1933979094028473,
1191
+ "reward_std": 2.0305717810988426,
1192
+ "rewards/AnswerChecker_LenCheck_Reward": 0.71839783154428,
1193
+ "rewards/all-MiniLM-L6-v2": 0.2875000024214387,
1194
+ "rewards/format_reward_beginEndSolutionFormat": 1.5874999929219484,
1195
+ "rewards/length_reward": 0.6000000080093741,
1196
+ "step": 156
1197
+ },
1198
+ {
1199
+ "completion_length": 742.8625106811523,
1200
+ "epoch": 0.10764776017714188,
1201
+ "grad_norm": 0.5482673645019531,
1202
+ "kl": 0.4775390625,
1203
+ "learning_rate": 4.593591825444028e-06,
1204
+ "loss": 0.0191,
1205
+ "reward": 2.9363540150225163,
1206
+ "reward_std": 2.4036293793469667,
1207
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6613539699465036,
1208
+ "rewards/all-MiniLM-L6-v2": 0.1000000024214387,
1209
+ "rewards/format_reward_beginEndSolutionFormat": 1.6249999897554517,
1210
+ "rewards/length_reward": 0.5500000109896064,
1211
+ "step": 158
1212
+ },
1213
+ {
1214
+ "completion_length": 828.2000198364258,
1215
+ "epoch": 0.10901039005280191,
1216
+ "grad_norm": 0.534371018409729,
1217
+ "kl": 0.58740234375,
1218
+ "learning_rate": 4.3294013613722944e-06,
1219
+ "loss": 0.0235,
1220
+ "reward": 2.148466292768717,
1221
+ "reward_std": 2.682615263853222,
1222
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6359662003815174,
1223
+ "rewards/all-MiniLM-L6-v2": -0.34999999962747097,
1224
+ "rewards/format_reward_beginEndSolutionFormat": 1.4374999962747097,
1225
+ "rewards/length_reward": 0.42500000540167093,
1226
+ "step": 160
1227
+ },
1228
+ {
1229
+ "completion_length": 742.5875148773193,
1230
+ "epoch": 0.11037301992846194,
1231
+ "grad_norm": 0.5626158714294434,
1232
+ "kl": 0.444580078125,
1233
+ "learning_rate": 4.070920709453597e-06,
1234
+ "loss": 0.0178,
1235
+ "reward": 2.9018475273624063,
1236
+ "reward_std": 2.3305454067885876,
1237
+ "rewards/AnswerChecker_LenCheck_Reward": 0.7143475078046322,
1238
+ "rewards/all-MiniLM-L6-v2": -0.08750000037252903,
1239
+ "rewards/format_reward_beginEndSolutionFormat": 1.6249999962747097,
1240
+ "rewards/length_reward": 0.650000006891787,
1241
+ "step": 162
1242
+ },
1243
+ {
1244
+ "completion_length": 766.9125213623047,
1245
+ "epoch": 0.11173564980412196,
1246
+ "grad_norm": 0.5314887762069702,
1247
+ "kl": 0.4736328125,
1248
+ "learning_rate": 3.818410137793947e-06,
1249
+ "loss": 0.019,
1250
+ "reward": 2.8508432680973783,
1251
+ "reward_std": 2.2408911171369255,
1252
+ "rewards/AnswerChecker_LenCheck_Reward": 0.7008432727307081,
1253
+ "rewards/all-MiniLM-L6-v2": -0.08749999944120646,
1254
+ "rewards/format_reward_beginEndSolutionFormat": 1.5874999929219484,
1255
+ "rewards/length_reward": 0.6500000124797225,
1256
+ "step": 164
1257
+ },
1258
+ {
1259
+ "completion_length": 800.5875053405762,
1260
+ "epoch": 0.11309827967978198,
1261
+ "grad_norm": 0.6379206776618958,
1262
+ "kl": 0.4525146484375,
1263
+ "learning_rate": 3.5721239031346067e-06,
1264
+ "loss": 0.0181,
1265
+ "reward": 2.8181837666779757,
1266
+ "reward_std": 1.9853734835051,
1267
+ "rewards/AnswerChecker_LenCheck_Reward": 0.7431836733594537,
1268
+ "rewards/all-MiniLM-L6-v2": -0.07500000111758709,
1269
+ "rewards/format_reward_beginEndSolutionFormat": 1.549999998882413,
1270
+ "rewards/length_reward": 0.6000000042840838,
1271
+ "step": 166
1272
+ },
1273
+ {
1274
+ "completion_length": 709.2625160217285,
1275
+ "epoch": 0.114460909555442,
1276
+ "grad_norm": 0.596421480178833,
1277
+ "kl": 0.461669921875,
1278
+ "learning_rate": 3.3323099948370853e-06,
1279
+ "loss": 0.0185,
1280
+ "reward": 3.579191967844963,
1281
+ "reward_std": 1.6637917342595756,
1282
+ "rewards/AnswerChecker_LenCheck_Reward": 0.7166919354349375,
1283
+ "rewards/all-MiniLM-L6-v2": 0.32499999925494194,
1284
+ "rewards/format_reward_beginEndSolutionFormat": 1.7374999932944775,
1285
+ "rewards/length_reward": 0.8000000063329935,
1286
+ "step": 168
1287
+ },
1288
+ {
1289
+ "completion_length": 778.4750156402588,
1290
+ "epoch": 0.11582353943110203,
1291
+ "grad_norm": 0.5588167309761047,
1292
+ "kl": 0.4560546875,
1293
+ "learning_rate": 3.099209885178882e-06,
1294
+ "loss": 0.0182,
1295
+ "reward": 2.6490836143493652,
1296
+ "reward_std": 2.552674914477393,
1297
+ "rewards/AnswerChecker_LenCheck_Reward": 0.7615835480391979,
1298
+ "rewards/all-MiniLM-L6-v2": -0.18749999813735485,
1299
+ "rewards/format_reward_beginEndSolutionFormat": 1.4749999921768904,
1300
+ "rewards/length_reward": 0.6000000108033419,
1301
+ "step": 170
1302
+ },
1303
+ {
1304
+ "completion_length": 792.4125118255615,
1305
+ "epoch": 0.11718616930676205,
1306
+ "grad_norm": 0.5151498317718506,
1307
+ "kl": 0.47119140625,
1308
+ "learning_rate": 2.8730582862113743e-06,
1309
+ "loss": 0.0188,
1310
+ "reward": 2.72108455048874,
1311
+ "reward_std": 2.322652825154364,
1312
+ "rewards/AnswerChecker_LenCheck_Reward": 0.7460845243185759,
1313
+ "rewards/all-MiniLM-L6-v2": -0.27499999944120646,
1314
+ "rewards/format_reward_beginEndSolutionFormat": 1.6249999925494194,
1315
+ "rewards/length_reward": 0.6250000139698386,
1316
+ "step": 172
1317
+ },
1318
+ {
1319
+ "completion_length": 821.4250068664551,
1320
+ "epoch": 0.11854879918242207,
1321
+ "grad_norm": 0.5205643773078918,
1322
+ "kl": 0.52734375,
1323
+ "learning_rate": 2.6540829134246683e-06,
1324
+ "loss": 0.0211,
1325
+ "reward": 2.610885198402684,
1326
+ "reward_std": 3.1788224331103265,
1327
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6858852654695511,
1328
+ "rewards/all-MiniLM-L6-v2": -0.01249999925494194,
1329
+ "rewards/format_reward_beginEndSolutionFormat": 1.3624999867752194,
1330
+ "rewards/length_reward": 0.5750000169500709,
1331
+ "step": 174
1332
+ },
1333
+ {
1334
+ "completion_length": 837.5875053405762,
1335
+ "epoch": 0.1199114290580821,
1336
+ "grad_norm": 0.5888510942459106,
1337
+ "kl": 0.466461181640625,
1338
+ "learning_rate": 2.4425042564574186e-06,
1339
+ "loss": 0.0187,
1340
+ "reward": 2.845615219324827,
1341
+ "reward_std": 2.686575308907777,
1342
+ "rewards/AnswerChecker_LenCheck_Reward": 0.708115229383111,
1343
+ "rewards/all-MiniLM-L6-v2": 0.1000000024214387,
1344
+ "rewards/format_reward_beginEndSolutionFormat": 1.4374999897554517,
1345
+ "rewards/length_reward": 0.6000000108033419,
1346
+ "step": 176
1347
+ },
1348
+ {
1349
+ "completion_length": 891.9000205993652,
1350
+ "epoch": 0.12127405893374212,
1351
+ "grad_norm": 0.5735381841659546,
1352
+ "kl": 0.5396728515625,
1353
+ "learning_rate": 2.2385353570824308e-06,
1354
+ "loss": 0.0216,
1355
+ "reward": 2.17099030315876,
1356
+ "reward_std": 2.5423294971697032,
1357
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6709902733564377,
1358
+ "rewards/all-MiniLM-L6-v2": -0.2999999998137355,
1359
+ "rewards/format_reward_beginEndSolutionFormat": 1.3249999964609742,
1360
+ "rewards/length_reward": 0.47500000707805157,
1361
+ "step": 178
1362
+ },
1363
+ {
1364
+ "completion_length": 876.9625091552734,
1365
+ "epoch": 0.12263668880940215,
1366
+ "grad_norm": 0.4231950342655182,
1367
+ "kl": 0.4525146484375,
1368
+ "learning_rate": 2.0423815946916783e-06,
1369
+ "loss": 0.0181,
1370
+ "reward": 2.40277567692101,
1371
+ "reward_std": 2.6864251531660557,
1372
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6652756258845329,
1373
+ "rewards/all-MiniLM-L6-v2": -0.38749999925494194,
1374
+ "rewards/format_reward_beginEndSolutionFormat": 1.5499999932944775,
1375
+ "rewards/length_reward": 0.5750000150874257,
1376
+ "step": 180
1377
+ },
1378
+ {
1379
+ "completion_length": 771.4625129699707,
1380
+ "epoch": 0.12399931868506217,
1381
+ "grad_norm": 0.6879674196243286,
1382
+ "kl": 0.4241943359375,
1383
+ "learning_rate": 1.854240479496643e-06,
1384
+ "loss": 0.017,
1385
+ "reward": 2.696409245952964,
1386
+ "reward_std": 2.6794386385008693,
1387
+ "rewards/AnswerChecker_LenCheck_Reward": 0.7464092001318932,
1388
+ "rewards/all-MiniLM-L6-v2": -0.16249999962747097,
1389
+ "rewards/format_reward_beginEndSolutionFormat": 1.512499999254942,
1390
+ "rewards/length_reward": 0.600000006146729,
1391
+ "step": 182
1392
+ },
1393
+ {
1394
+ "completion_length": 856.8250198364258,
1395
+ "epoch": 0.1253619485607222,
1396
+ "grad_norm": 0.5000143051147461,
1397
+ "kl": 0.489013671875,
1398
+ "learning_rate": 1.6743014536522872e-06,
1399
+ "loss": 0.0196,
1400
+ "reward": 2.656417434802279,
1401
+ "reward_std": 2.805582005530596,
1402
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6689173814374954,
1403
+ "rewards/all-MiniLM-L6-v2": 0.10000000055879354,
1404
+ "rewards/format_reward_beginEndSolutionFormat": 1.2874999940395355,
1405
+ "rewards/length_reward": 0.6000000108033419,
1406
+ "step": 184
1407
+ },
1408
+ {
1409
+ "completion_length": 966.1875114440918,
1410
+ "epoch": 0.1267245784363822,
1411
+ "grad_norm": 0.5162502527236938,
1412
+ "kl": 0.565032958984375,
1413
+ "learning_rate": 1.5027457005048573e-06,
1414
+ "loss": 0.0226,
1415
+ "reward": 2.036801053909585,
1416
+ "reward_std": 2.7416782565414906,
1417
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6243010433390737,
1418
+ "rewards/all-MiniLM-L6-v2": -0.16250000149011612,
1419
+ "rewards/format_reward_beginEndSolutionFormat": 1.1749999970197678,
1420
+ "rewards/length_reward": 0.4000000013038516,
1421
+ "step": 186
1422
+ },
1423
+ {
1424
+ "completion_length": 861.5000190734863,
1425
+ "epoch": 0.12808720831204223,
1426
+ "grad_norm": 0.5157842040061951,
1427
+ "kl": 0.493896484375,
1428
+ "learning_rate": 1.339745962155613e-06,
1429
+ "loss": 0.0197,
1430
+ "reward": 2.1817345349118114,
1431
+ "reward_std": 2.4753956105560064,
1432
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6442344710230827,
1433
+ "rewards/all-MiniLM-L6-v2": -0.49999999813735485,
1434
+ "rewards/format_reward_beginEndSolutionFormat": 1.5124999918043613,
1435
+ "rewards/length_reward": 0.525000006891787,
1436
+ "step": 188
1437
+ },
1438
+ {
1439
+ "completion_length": 839.6875190734863,
1440
+ "epoch": 0.12944983818770225,
1441
+ "grad_norm": 0.4954194128513336,
1442
+ "kl": 0.483642578125,
1443
+ "learning_rate": 1.1854663655241804e-06,
1444
+ "loss": 0.0193,
1445
+ "reward": 2.925493508577347,
1446
+ "reward_std": 2.9417885206639767,
1447
+ "rewards/AnswerChecker_LenCheck_Reward": 0.7129934653639793,
1448
+ "rewards/all-MiniLM-L6-v2": 0.06250000186264515,
1449
+ "rewards/format_reward_beginEndSolutionFormat": 1.4749999903142452,
1450
+ "rewards/length_reward": 0.6750000109896064,
1451
+ "step": 190
1452
+ },
1453
+ {
1454
+ "completion_length": 824.5375061035156,
1455
+ "epoch": 0.1308124680633623,
1456
+ "grad_norm": 0.4791363477706909,
1457
+ "kl": 0.4300537109375,
1458
+ "learning_rate": 1.0400622570866426e-06,
1459
+ "loss": 0.0172,
1460
+ "reward": 2.805993139743805,
1461
+ "reward_std": 2.586992080323398,
1462
+ "rewards/AnswerChecker_LenCheck_Reward": 0.7059930665418506,
1463
+ "rewards/all-MiniLM-L6-v2": -0.050000001676380634,
1464
+ "rewards/format_reward_beginEndSolutionFormat": 1.5499999932944775,
1465
+ "rewards/length_reward": 0.6000000108033419,
1466
+ "step": 192
1467
+ },
1468
+ {
1469
+ "completion_length": 923.1625137329102,
1470
+ "epoch": 0.13217509793902232,
1471
+ "grad_norm": 0.40569427609443665,
1472
+ "kl": 0.61822509765625,
1473
+ "learning_rate": 9.036800464548157e-07,
1474
+ "loss": 0.0247,
1475
+ "reward": 2.090137917548418,
1476
+ "reward_std": 3.2905192878097296,
1477
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6651378944516182,
1478
+ "rewards/all-MiniLM-L6-v2": -0.23750000447034836,
1479
+ "rewards/format_reward_beginEndSolutionFormat": 1.2124999891966581,
1480
+ "rewards/length_reward": 0.45000001043081284,
1481
+ "step": 194
1482
+ },
1483
+ {
1484
+ "completion_length": 860.8375129699707,
1485
+ "epoch": 0.13353772781468234,
1486
+ "grad_norm": 0.49536529183387756,
1487
+ "kl": 0.450927734375,
1488
+ "learning_rate": 7.764570589541876e-07,
1489
+ "loss": 0.018,
1490
+ "reward": 2.6283551678061485,
1491
+ "reward_std": 2.2464691884815693,
1492
+ "rewards/AnswerChecker_LenCheck_Reward": 0.7033550683408976,
1493
+ "rewards/all-MiniLM-L6-v2": -0.23749999701976776,
1494
+ "rewards/format_reward_beginEndSolutionFormat": 1.5874999919906259,
1495
+ "rewards/length_reward": 0.5750000067055225,
1496
+ "step": 196
1497
+ },
1498
+ {
1499
+ "completion_length": 886.8375091552734,
1500
+ "epoch": 0.13490035769034237,
1501
+ "grad_norm": 0.5045952200889587,
1502
+ "kl": 0.5023193359375,
1503
+ "learning_rate": 6.585213973489335e-07,
1504
+ "loss": 0.0201,
1505
+ "reward": 2.5980349611490965,
1506
+ "reward_std": 2.6176577494479716,
1507
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6730349091812968,
1508
+ "rewards/all-MiniLM-L6-v2": -0.125,
1509
+ "rewards/format_reward_beginEndSolutionFormat": 1.474999994970858,
1510
+ "rewards/length_reward": 0.5750000094994903,
1511
+ "step": 198
1512
+ },
1513
+ {
1514
+ "completion_length": 821.5750122070312,
1515
+ "epoch": 0.13626298756600239,
1516
+ "grad_norm": 0.5383718609809875,
1517
+ "kl": 0.44384765625,
1518
+ "learning_rate": 5.499918128533155e-07,
1519
+ "loss": 0.0178,
1520
+ "reward": 2.671635969541967,
1521
+ "reward_std": 2.3409718545153737,
1522
+ "rewards/AnswerChecker_LenCheck_Reward": 0.7591358739882708,
1523
+ "rewards/all-MiniLM-L6-v2": -0.3124999990686774,
1524
+ "rewards/format_reward_beginEndSolutionFormat": 1.6249999972060323,
1525
+ "rewards/length_reward": 0.600000012665987,
1526
+ "step": 200
1527
+ },
1528
+ {
1529
+ "epoch": 0.13626298756600239,
1530
+ "eval_completion_length": 901.0770969390869,
1531
+ "eval_kl": 0.51776123046875,
1532
+ "eval_loss": 0.020734939724206924,
1533
+ "eval_reward": 2.2421040004119277,
1534
+ "eval_reward_std": 3.041678664914798,
1535
+ "eval_rewards/AnswerChecker_LenCheck_Reward": 0.6764789510052651,
1536
+ "eval_rewards/all-MiniLM-L6-v2": -0.21718749997671694,
1537
+ "eval_rewards/format_reward_beginEndSolutionFormat": 1.2921874948078766,
1538
+ "eval_rewards/length_reward": 0.4906250089406967,
1539
+ "eval_runtime": 1491.0997,
1540
+ "eval_samples_per_second": 0.086,
1541
+ "eval_steps_per_second": 0.017,
1542
+ "step": 200
1543
+ },
1544
+ {
1545
+ "completion_length": 868.400016784668,
1546
+ "epoch": 0.1376256174416624,
1547
+ "grad_norm": 0.5225978493690491,
1548
+ "kl": 0.470703125,
1549
+ "learning_rate": 4.509775855592613e-07,
1550
+ "loss": 0.0188,
1551
+ "reward": 2.6535421311855316,
1552
+ "reward_std": 2.792340838816017,
1553
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6660420242697,
1554
+ "rewards/all-MiniLM-L6-v2": 0.08750000316649675,
1555
+ "rewards/format_reward_beginEndSolutionFormat": 1.3249999936670065,
1556
+ "rewards/length_reward": 0.5750000094994903,
1557
+ "step": 202
1558
+ },
1559
+ {
1560
+ "completion_length": 866.7875213623047,
1561
+ "epoch": 0.13898824731732243,
1562
+ "grad_norm": 0.4848329722881317,
1563
+ "kl": 0.485107421875,
1564
+ "learning_rate": 3.615784144005796e-07,
1565
+ "loss": 0.0194,
1566
+ "reward": 2.717348374426365,
1567
+ "reward_std": 3.1343387353699654,
1568
+ "rewards/AnswerChecker_LenCheck_Reward": 0.7048483230173588,
1569
+ "rewards/all-MiniLM-L6-v2": 0.06250000186264515,
1570
+ "rewards/format_reward_beginEndSolutionFormat": 1.3999999919906259,
1571
+ "rewards/length_reward": 0.550000011920929,
1572
+ "step": 204
1573
+ },
1574
+ {
1575
+ "completion_length": 899.8250122070312,
1576
+ "epoch": 0.14035087719298245,
1577
+ "grad_norm": 0.7146245837211609,
1578
+ "kl": 1.0164794921875,
1579
+ "learning_rate": 2.818843167645835e-07,
1580
+ "loss": 0.0407,
1581
+ "reward": 2.0691821854561567,
1582
+ "reward_std": 3.262540274299681,
1583
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6441821809858084,
1584
+ "rewards/all-MiniLM-L6-v2": -0.23750000074505806,
1585
+ "rewards/format_reward_beginEndSolutionFormat": 1.2124999957159162,
1586
+ "rewards/length_reward": 0.4500000076368451,
1587
+ "step": 206
1588
+ },
1589
+ {
1590
+ "completion_length": 891.0625190734863,
1591
+ "epoch": 0.14171350706864247,
1592
+ "grad_norm": 0.4813491702079773,
1593
+ "kl": 0.4830322265625,
1594
+ "learning_rate": 2.119755378522137e-07,
1595
+ "loss": 0.0193,
1596
+ "reward": 2.4804067488294095,
1597
+ "reward_std": 2.613793831784278,
1598
+ "rewards/AnswerChecker_LenCheck_Reward": 0.680406715720892,
1599
+ "rewards/all-MiniLM-L6-v2": -0.04999999608844519,
1600
+ "rewards/format_reward_beginEndSolutionFormat": 1.325000001117587,
1601
+ "rewards/length_reward": 0.5250000078231096,
1602
+ "step": 208
1603
+ },
1604
+ {
1605
+ "completion_length": 837.337516784668,
1606
+ "epoch": 0.1430761369443025,
1607
+ "grad_norm": 0.4959147870540619,
1608
+ "kl": 0.4730224609375,
1609
+ "learning_rate": 1.519224698779198e-07,
1610
+ "loss": 0.0189,
1611
+ "reward": 2.9152769446372986,
1612
+ "reward_std": 2.3198010358028114,
1613
+ "rewards/AnswerChecker_LenCheck_Reward": 0.7527768295258284,
1614
+ "rewards/all-MiniLM-L6-v2": 1.862645149230957e-09,
1615
+ "rewards/format_reward_beginEndSolutionFormat": 1.5874999957159162,
1616
+ "rewards/length_reward": 0.5750000039115548,
1617
+ "step": 210
1618
+ },
1619
+ {
1620
+ "completion_length": 871.5125160217285,
1621
+ "epoch": 0.14443876681996254,
1622
+ "grad_norm": 0.9553112387657166,
1623
+ "kl": 0.5531005859375,
1624
+ "learning_rate": 1.0178558119067316e-07,
1625
+ "loss": 0.0221,
1626
+ "reward": 2.6484466195106506,
1627
+ "reward_std": 2.807733765337616,
1628
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6984465261921287,
1629
+ "rewards/all-MiniLM-L6-v2": 0.08750000223517418,
1630
+ "rewards/format_reward_beginEndSolutionFormat": 1.2874999893829226,
1631
+ "rewards/length_reward": 0.5750000076368451,
1632
+ "step": 212
1633
+ },
1634
+ {
1635
+ "completion_length": 909.4750137329102,
1636
+ "epoch": 0.14580139669562256,
1637
+ "grad_norm": 0.5723004937171936,
1638
+ "kl": 0.57098388671875,
1639
+ "learning_rate": 6.161535538745877e-08,
1640
+ "loss": 0.0229,
1641
+ "reward": 1.5668016755953431,
1642
+ "reward_std": 2.593209580751136,
1643
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6418016445823014,
1644
+ "rewards/all-MiniLM-L6-v2": -0.5000000027939677,
1645
+ "rewards/format_reward_beginEndSolutionFormat": 1.0249999966472387,
1646
+ "rewards/length_reward": 0.40000000316649675,
1647
+ "step": 214
1648
+ },
1649
+ {
1650
+ "completion_length": 968.3375205993652,
1651
+ "epoch": 0.14716402657128258,
1652
+ "grad_norm": 0.4992906153202057,
1653
+ "kl": 0.6209716796875,
1654
+ "learning_rate": 3.1452240480577265e-08,
1655
+ "loss": 0.0248,
1656
+ "reward": 1.5952948506455868,
1657
+ "reward_std": 3.052502178354189,
1658
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6452948525547981,
1659
+ "rewards/all-MiniLM-L6-v2": -0.462499993853271,
1660
+ "rewards/format_reward_beginEndSolutionFormat": 1.062499993480742,
1661
+ "rewards/length_reward": 0.350000006146729,
1662
+ "step": 216
1663
+ },
1664
+ {
1665
+ "completion_length": 888.4625244140625,
1666
+ "epoch": 0.1485266564469426,
1667
+ "grad_norm": 0.5244458913803101,
1668
+ "kl": 0.5447998046875,
1669
+ "learning_rate": 1.1326608169920373e-08,
1670
+ "loss": 0.0218,
1671
+ "reward": 2.0069867819547653,
1672
+ "reward_std": 2.44455174729228,
1673
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6694867238402367,
1674
+ "rewards/all-MiniLM-L6-v2": -0.4124999986961484,
1675
+ "rewards/format_reward_beginEndSolutionFormat": 1.2499999990686774,
1676
+ "rewards/length_reward": 0.5000000046566129,
1677
+ "step": 218
1678
+ },
1679
+ {
1680
+ "completion_length": 940.7250213623047,
1681
+ "epoch": 0.14988928632260262,
1682
+ "grad_norm": 0.45825526118278503,
1683
+ "kl": 0.569427490234375,
1684
+ "learning_rate": 1.2587232612493172e-09,
1685
+ "loss": 0.0228,
1686
+ "reward": 2.0391996294492856,
1687
+ "reward_std": 3.5446557994000614,
1688
+ "rewards/AnswerChecker_LenCheck_Reward": 0.6266996059566736,
1689
+ "rewards/all-MiniLM-L6-v2": -0.04999999608844519,
1690
+ "rewards/format_reward_beginEndSolutionFormat": 1.0624999953433871,
1691
+ "rewards/length_reward": 0.40000000689178705,
1692
+ "step": 220
1693
+ },
1694
+ {
1695
+ "completion_length": 826.150016784668,
1696
+ "epoch": 0.15057060126043265,
1697
+ "kl": 0.503173828125,
1698
+ "reward": 2.8795320093631744,
1699
+ "reward_std": 2.7599759995937347,
1700
+ "rewards/AnswerChecker_LenCheck_Reward": 0.7045319639146328,
1701
+ "rewards/all-MiniLM-L6-v2": -0.04999999701976776,
1702
+ "rewards/format_reward_beginEndSolutionFormat": 1.6249999925494194,
1703
+ "rewards/length_reward": 0.600000012665987,
1704
+ "step": 221,
1705
+ "total_flos": 0.0,
1706
+ "train_loss": 0.014638028192195337,
1707
+ "train_runtime": 22832.117,
1708
+ "train_samples_per_second": 0.077,
1709
+ "train_steps_per_second": 0.01
1710
+ }
1711
+ ],
1712
+ "logging_steps": 2,
1713
+ "max_steps": 221,
1714
+ "num_input_tokens_seen": 0,
1715
+ "num_train_epochs": 1,
1716
+ "save_steps": 30,
1717
+ "stateful_callbacks": {
1718
+ "TrainerControl": {
1719
+ "args": {
1720
+ "should_epoch_stop": false,
1721
+ "should_evaluate": false,
1722
+ "should_log": false,
1723
+ "should_save": false,
1724
+ "should_training_stop": false
1725
+ },
1726
+ "attributes": {}
1727
+ }
1728
+ },
1729
+ "total_flos": 0.0,
1730
+ "train_batch_size": 5,
1731
+ "trial_name": null,
1732
+ "trial_params": null
1733
+ }
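The log_history records above follow the standard Hugging Face Trainer state layout: each entry carries the optimisation step, loss, KL, and the individual reward components (rewards/AnswerChecker_LenCheck_Reward, rewards/all-MiniLM-L6-v2, rewards/format_reward_beginEndSolutionFormat, rewards/length_reward), with periodic eval_* snapshots such as the one at step 200. A minimal sketch for inspecting these curves from a local copy of trainer_state.json follows; the file path and the use of pandas/matplotlib are illustrative assumptions, not dependencies of this upload.

# Sketch: summarise the reward curves recorded in trainer_state.json.
# Assumes trainer_state.json has been downloaded next to this script;
# pandas and matplotlib are illustrative choices, not part of the repo.
import json

import matplotlib.pyplot as plt
import pandas as pd

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep training records only; evaluation entries use "eval_*" keys instead.
train_logs = [entry for entry in state["log_history"] if "reward" in entry]
df = pd.DataFrame(train_logs)

print(df[["step", "loss", "kl", "reward", "reward_std"]].tail())

# Plot the total reward and each reward component against the training step.
reward_cols = ["reward"] + [c for c in df.columns if c.startswith("rewards/")]
df.plot(x="step", y=reward_cols, figsize=(10, 5))
plt.xlabel("step")
plt.ylabel("reward")
plt.tight_layout()
plt.savefig("reward_curves.png")

The same frame also exposes completion_length and learning_rate, which is useful if length or schedule effects need to be checked alongside the reward components.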
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5f63d3e5d81f6c418113f92a3e8871ea668566937438b2bd6902a667d61bce3
3
+ size 7800
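training_args.bin is stored here as a Git LFS pointer; in Trainer-based workflows this file is the pickled training-arguments object saved next to the checkpoints. A hedged sketch for inspecting it locally follows. It assumes the actual binary has been fetched (for example via git lfs pull or huggingface_hub) and that compatible transformers/trl versions are installed, since unpickling needs the original argument classes; weights_only=False is needed on recent PyTorch releases because the file is not a plain tensor archive.

# Sketch: inspect the pickled training arguments shipped as training_args.bin.
# Assumes the LFS object has been materialised locally and the training-time
# libraries are importable; otherwise torch.load cannot rebuild the class.
import torch

args = torch.load("training_args.bin", weights_only=False)

print(type(args).__name__)               # the concrete TrainingArguments subclass
print(args.learning_rate)
print(args.per_device_train_batch_size)
print(args.num_train_epochs)

If only the scalar summary is needed, the tail of trainer_state.json above already exposes train_loss, train_runtime, and the throughput figures without unpickling anything.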