File size: 72,356 Bytes
e30f802
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 100,
  "global_step": 84,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "debug/policy_chosen_logits": -3.2206931114196777,
      "debug/policy_chosen_logps": -200.86012268066406,
      "debug/policy_rejected_logits": -2.898437738418579,
      "debug/policy_rejected_logps": -208.0646514892578,
      "debug/reference_chosen_logps": -200.86012268066406,
      "debug/reference_rejected_logps": -208.0646514892578,
      "epoch": 0.011904761904761904,
      "grad_norm": 7.909108829449386,
      "learning_rate": 1e-06,
      "logits/chosen": -3.2206931114196777,
      "logits/rejected": -2.898437738418579,
      "logps/chosen": -200.86012268066406,
      "logps/rejected": -208.0646514892578,
      "loss": 0.5,
      "rewards/accuracies": 0.0,
      "rewards/chosen": 0.0,
      "rewards/margins": 0.0,
      "rewards/rejected": 0.0,
      "step": 1
    },
    {
      "debug/policy_chosen_logits": -3.1326141357421875,
      "debug/policy_chosen_logps": -219.65463256835938,
      "debug/policy_rejected_logits": -3.1044466495513916,
      "debug/policy_rejected_logps": -218.29165649414062,
      "debug/reference_chosen_logps": -219.87649536132812,
      "debug/reference_rejected_logps": -218.21566772460938,
      "epoch": 0.023809523809523808,
      "grad_norm": 7.063786345467847,
      "learning_rate": 1e-06,
      "logits/chosen": -3.1326141357421875,
      "logits/rejected": -3.1044466495513916,
      "logps/chosen": -219.65463256835938,
      "logps/rejected": -218.29165649414062,
      "loss": 0.4983,
      "rewards/accuracies": 0.625,
      "rewards/chosen": 0.0022186278365552425,
      "rewards/margins": 0.0029784394428133965,
      "rewards/rejected": -0.0007598113734275103,
      "step": 2
    },
    {
      "debug/policy_chosen_logits": -3.135671377182007,
      "debug/policy_chosen_logps": -208.30648803710938,
      "debug/policy_rejected_logits": -3.0028326511383057,
      "debug/policy_rejected_logps": -239.4796142578125,
      "debug/reference_chosen_logps": -208.16616821289062,
      "debug/reference_rejected_logps": -239.21315002441406,
      "epoch": 0.03571428571428571,
      "grad_norm": 7.814141071934522,
      "learning_rate": 1e-06,
      "logits/chosen": -3.135671377182007,
      "logits/rejected": -3.0028326511383057,
      "logps/chosen": -208.30648803710938,
      "logps/rejected": -239.4796142578125,
      "loss": 0.4933,
      "rewards/accuracies": 0.5,
      "rewards/chosen": -0.0014033315237611532,
      "rewards/margins": 0.0012612915597856045,
      "rewards/rejected": -0.0026646230835467577,
      "step": 3
    },
    {
      "debug/policy_chosen_logits": -3.187422275543213,
      "debug/policy_chosen_logps": -217.25140380859375,
      "debug/policy_rejected_logits": -3.1444270610809326,
      "debug/policy_rejected_logps": -219.7812957763672,
      "debug/reference_chosen_logps": -217.35531616210938,
      "debug/reference_rejected_logps": -218.98512268066406,
      "epoch": 0.047619047619047616,
      "grad_norm": 7.09702591230904,
      "learning_rate": 1e-06,
      "logits/chosen": -3.187422275543213,
      "logits/rejected": -3.1444270610809326,
      "logps/chosen": -217.25140380859375,
      "logps/rejected": -219.7812957763672,
      "loss": 0.4896,
      "rewards/accuracies": 0.625,
      "rewards/chosen": 0.001039218856021762,
      "rewards/margins": 0.009000949561595917,
      "rewards/rejected": -0.007961731404066086,
      "step": 4
    },
    {
      "debug/policy_chosen_logits": -3.0982069969177246,
      "debug/policy_chosen_logps": -216.79022216796875,
      "debug/policy_rejected_logits": -2.9893712997436523,
      "debug/policy_rejected_logps": -236.919189453125,
      "debug/reference_chosen_logps": -217.1226806640625,
      "debug/reference_rejected_logps": -235.52293395996094,
      "epoch": 0.05952380952380952,
      "grad_norm": 5.630512315088139,
      "learning_rate": 1e-06,
      "logits/chosen": -3.0982069969177246,
      "logits/rejected": -2.9893712997436523,
      "logps/chosen": -216.79022216796875,
      "logps/rejected": -236.919189453125,
      "loss": 0.4832,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.0033246802631765604,
      "rewards/margins": 0.01728721708059311,
      "rewards/rejected": -0.013962535187602043,
      "step": 5
    },
    {
      "debug/policy_chosen_logits": -3.1938157081604004,
      "debug/policy_chosen_logps": -207.0302734375,
      "debug/policy_rejected_logits": -3.085257053375244,
      "debug/policy_rejected_logps": -217.99813842773438,
      "debug/reference_chosen_logps": -207.5093536376953,
      "debug/reference_rejected_logps": -216.325927734375,
      "epoch": 0.07142857142857142,
      "grad_norm": 5.70424011476702,
      "learning_rate": 1e-06,
      "logits/chosen": -3.1938157081604004,
      "logits/rejected": -3.085257053375244,
      "logps/chosen": -207.0302734375,
      "logps/rejected": -217.99813842773438,
      "loss": 0.4773,
      "rewards/accuracies": 0.625,
      "rewards/chosen": 0.004790782928466797,
      "rewards/margins": 0.02151281200349331,
      "rewards/rejected": -0.01672203093767166,
      "step": 6
    },
    {
      "debug/policy_chosen_logits": -3.163257122039795,
      "debug/policy_chosen_logps": -211.92942810058594,
      "debug/policy_rejected_logits": -3.0449893474578857,
      "debug/policy_rejected_logps": -214.0583953857422,
      "debug/reference_chosen_logps": -212.87844848632812,
      "debug/reference_rejected_logps": -212.81723022460938,
      "epoch": 0.08333333333333333,
      "grad_norm": 6.547599704469646,
      "learning_rate": 1e-06,
      "logits/chosen": -3.163257122039795,
      "logits/rejected": -3.0449893474578857,
      "logps/chosen": -211.92942810058594,
      "logps/rejected": -214.0583953857422,
      "loss": 0.4684,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.00949014537036419,
      "rewards/margins": 0.021901872009038925,
      "rewards/rejected": -0.012411728501319885,
      "step": 7
    },
    {
      "debug/policy_chosen_logits": -3.2458667755126953,
      "debug/policy_chosen_logps": -197.6811065673828,
      "debug/policy_rejected_logits": -3.0591673851013184,
      "debug/policy_rejected_logps": -231.7470245361328,
      "debug/reference_chosen_logps": -199.1888427734375,
      "debug/reference_rejected_logps": -228.80226135253906,
      "epoch": 0.09523809523809523,
      "grad_norm": 7.248047931654466,
      "learning_rate": 1e-06,
      "logits/chosen": -3.2458667755126953,
      "logits/rejected": -3.0591673851013184,
      "logps/chosen": -197.6811065673828,
      "logps/rejected": -231.7470245361328,
      "loss": 0.4588,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.015077304095029831,
      "rewards/margins": 0.04452499374747276,
      "rewards/rejected": -0.029447689652442932,
      "step": 8
    },
    {
      "debug/policy_chosen_logits": -3.212696075439453,
      "debug/policy_chosen_logps": -196.501708984375,
      "debug/policy_rejected_logits": -3.0544850826263428,
      "debug/policy_rejected_logps": -217.50942993164062,
      "debug/reference_chosen_logps": -198.6233673095703,
      "debug/reference_rejected_logps": -214.6073760986328,
      "epoch": 0.10714285714285714,
      "grad_norm": 6.028275040076952,
      "learning_rate": 1e-06,
      "logits/chosen": -3.212696075439453,
      "logits/rejected": -3.0544850826263428,
      "logps/chosen": -196.501708984375,
      "logps/rejected": -217.50942993164062,
      "loss": 0.4341,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.021216563880443573,
      "rewards/margins": 0.050237104296684265,
      "rewards/rejected": -0.029020538553595543,
      "step": 9
    },
    {
      "debug/policy_chosen_logits": -3.1316659450531006,
      "debug/policy_chosen_logps": -216.9590301513672,
      "debug/policy_rejected_logits": -3.0151987075805664,
      "debug/policy_rejected_logps": -225.83355712890625,
      "debug/reference_chosen_logps": -221.61790466308594,
      "debug/reference_rejected_logps": -219.59786987304688,
      "epoch": 0.11904761904761904,
      "grad_norm": 5.4813634886972515,
      "learning_rate": 1e-06,
      "logits/chosen": -3.1316659450531006,
      "logits/rejected": -3.0151987075805664,
      "logps/chosen": -216.9590301513672,
      "logps/rejected": -225.83355712890625,
      "loss": 0.4064,
      "rewards/accuracies": 0.625,
      "rewards/chosen": 0.046588800847530365,
      "rewards/margins": 0.10894560813903809,
      "rewards/rejected": -0.06235681474208832,
      "step": 10
    },
    {
      "debug/policy_chosen_logits": -3.2891530990600586,
      "debug/policy_chosen_logps": -196.72274780273438,
      "debug/policy_rejected_logits": -3.048377513885498,
      "debug/policy_rejected_logps": -233.62039184570312,
      "debug/reference_chosen_logps": -202.43775939941406,
      "debug/reference_rejected_logps": -216.876708984375,
      "epoch": 0.13095238095238096,
      "grad_norm": 4.876535878960536,
      "learning_rate": 1e-06,
      "logits/chosen": -3.2891530990600586,
      "logits/rejected": -3.048377513885498,
      "logps/chosen": -196.72274780273438,
      "logps/rejected": -233.62039184570312,
      "loss": 0.4083,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.05715004354715347,
      "rewards/margins": 0.22458696365356445,
      "rewards/rejected": -0.16743692755699158,
      "step": 11
    },
    {
      "debug/policy_chosen_logits": -3.1717939376831055,
      "debug/policy_chosen_logps": -210.72622680664062,
      "debug/policy_rejected_logits": -3.060811758041382,
      "debug/policy_rejected_logps": -230.44659423828125,
      "debug/reference_chosen_logps": -219.1645050048828,
      "debug/reference_rejected_logps": -223.36727905273438,
      "epoch": 0.14285714285714285,
      "grad_norm": 5.9567820612609585,
      "learning_rate": 1e-06,
      "logits/chosen": -3.1717939376831055,
      "logits/rejected": -3.060811758041382,
      "logps/chosen": -210.72622680664062,
      "logps/rejected": -230.44659423828125,
      "loss": 0.3617,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.08438283205032349,
      "rewards/margins": 0.155176043510437,
      "rewards/rejected": -0.07079321146011353,
      "step": 12
    },
    {
      "debug/policy_chosen_logits": -3.175966739654541,
      "debug/policy_chosen_logps": -198.75677490234375,
      "debug/policy_rejected_logits": -3.0943973064422607,
      "debug/policy_rejected_logps": -209.6800994873047,
      "debug/reference_chosen_logps": -206.38829040527344,
      "debug/reference_rejected_logps": -198.82118225097656,
      "epoch": 0.15476190476190477,
      "grad_norm": 4.372108578101748,
      "learning_rate": 1e-06,
      "logits/chosen": -3.175966739654541,
      "logits/rejected": -3.0943973064422607,
      "logps/chosen": -198.75677490234375,
      "logps/rejected": -209.6800994873047,
      "loss": 0.378,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.07631513476371765,
      "rewards/margins": 0.1849043220281601,
      "rewards/rejected": -0.10858918726444244,
      "step": 13
    },
    {
      "debug/policy_chosen_logits": -3.17907977104187,
      "debug/policy_chosen_logps": -204.30484008789062,
      "debug/policy_rejected_logits": -3.0033490657806396,
      "debug/policy_rejected_logps": -224.59976196289062,
      "debug/reference_chosen_logps": -215.46783447265625,
      "debug/reference_rejected_logps": -213.1477813720703,
      "epoch": 0.16666666666666666,
      "grad_norm": 4.951824369660157,
      "learning_rate": 1e-06,
      "logits/chosen": -3.17907977104187,
      "logits/rejected": -3.0033490657806396,
      "logps/chosen": -204.30484008789062,
      "logps/rejected": -224.59976196289062,
      "loss": 0.354,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.1116299033164978,
      "rewards/margins": 0.22614973783493042,
      "rewards/rejected": -0.11451983451843262,
      "step": 14
    },
    {
      "debug/policy_chosen_logits": -3.0961408615112305,
      "debug/policy_chosen_logps": -209.14535522460938,
      "debug/policy_rejected_logits": -3.00311279296875,
      "debug/policy_rejected_logps": -224.08718872070312,
      "debug/reference_chosen_logps": -220.22549438476562,
      "debug/reference_rejected_logps": -222.09124755859375,
      "epoch": 0.17857142857142858,
      "grad_norm": 4.835613329592286,
      "learning_rate": 1e-06,
      "logits/chosen": -3.0961408615112305,
      "logits/rejected": -3.00311279296875,
      "logps/chosen": -209.14535522460938,
      "logps/rejected": -224.08718872070312,
      "loss": 0.386,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.11080135405063629,
      "rewards/margins": 0.13076072931289673,
      "rewards/rejected": -0.019959375262260437,
      "step": 15
    },
    {
      "debug/policy_chosen_logits": -3.2088327407836914,
      "debug/policy_chosen_logps": -206.8312530517578,
      "debug/policy_rejected_logits": -3.0719218254089355,
      "debug/policy_rejected_logps": -226.76593017578125,
      "debug/reference_chosen_logps": -218.99668884277344,
      "debug/reference_rejected_logps": -211.85366821289062,
      "epoch": 0.19047619047619047,
      "grad_norm": 4.544057800793572,
      "learning_rate": 1e-06,
      "logits/chosen": -3.2088327407836914,
      "logits/rejected": -3.0719218254089355,
      "logps/chosen": -206.8312530517578,
      "logps/rejected": -226.76593017578125,
      "loss": 0.3723,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.12165433168411255,
      "rewards/margins": 0.27077698707580566,
      "rewards/rejected": -0.1491226851940155,
      "step": 16
    },
    {
      "debug/policy_chosen_logits": -3.1967031955718994,
      "debug/policy_chosen_logps": -206.11988830566406,
      "debug/policy_rejected_logits": -3.192483425140381,
      "debug/policy_rejected_logps": -230.90736389160156,
      "debug/reference_chosen_logps": -221.86349487304688,
      "debug/reference_rejected_logps": -223.01187133789062,
      "epoch": 0.20238095238095238,
      "grad_norm": 4.376140545216712,
      "learning_rate": 1e-06,
      "logits/chosen": -3.1967031955718994,
      "logits/rejected": -3.192483425140381,
      "logps/chosen": -206.11988830566406,
      "logps/rejected": -230.90736389160156,
      "loss": 0.3998,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.15743595361709595,
      "rewards/margins": 0.23639078438282013,
      "rewards/rejected": -0.07895481586456299,
      "step": 17
    },
    {
      "debug/policy_chosen_logits": -3.1152353286743164,
      "debug/policy_chosen_logps": -193.357177734375,
      "debug/policy_rejected_logits": -2.959299087524414,
      "debug/policy_rejected_logps": -263.7359313964844,
      "debug/reference_chosen_logps": -210.5402069091797,
      "debug/reference_rejected_logps": -232.1613311767578,
      "epoch": 0.21428571428571427,
      "grad_norm": 7.866472140538224,
      "learning_rate": 1e-06,
      "logits/chosen": -3.1152353286743164,
      "logits/rejected": -2.959299087524414,
      "logps/chosen": -193.357177734375,
      "logps/rejected": -263.7359313964844,
      "loss": 0.4077,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.17183025181293488,
      "rewards/margins": 0.4875761568546295,
      "rewards/rejected": -0.31574589014053345,
      "step": 18
    },
    {
      "debug/policy_chosen_logits": -3.2761292457580566,
      "debug/policy_chosen_logps": -185.4181365966797,
      "debug/policy_rejected_logits": -3.269240379333496,
      "debug/policy_rejected_logps": -225.70327758789062,
      "debug/reference_chosen_logps": -202.86187744140625,
      "debug/reference_rejected_logps": -228.69943237304688,
      "epoch": 0.2261904761904762,
      "grad_norm": 7.929981237103111,
      "learning_rate": 1e-06,
      "logits/chosen": -3.2761292457580566,
      "logits/rejected": -3.269240379333496,
      "logps/chosen": -185.4181365966797,
      "logps/rejected": -225.70327758789062,
      "loss": 0.3953,
      "rewards/accuracies": 0.625,
      "rewards/chosen": 0.17443738877773285,
      "rewards/margins": 0.14447586238384247,
      "rewards/rejected": 0.02996152639389038,
      "step": 19
    },
    {
      "debug/policy_chosen_logits": -3.2340097427368164,
      "debug/policy_chosen_logps": -193.43096923828125,
      "debug/policy_rejected_logits": -3.099465847015381,
      "debug/policy_rejected_logps": -230.0125732421875,
      "debug/reference_chosen_logps": -216.61485290527344,
      "debug/reference_rejected_logps": -211.0814208984375,
      "epoch": 0.23809523809523808,
      "grad_norm": 8.640304701773179,
      "learning_rate": 1e-06,
      "logits/chosen": -3.2340097427368164,
      "logits/rejected": -3.099465847015381,
      "logps/chosen": -193.43096923828125,
      "logps/rejected": -230.0125732421875,
      "loss": 0.3575,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.23183873295783997,
      "rewards/margins": 0.42115041613578796,
      "rewards/rejected": -0.189311683177948,
      "step": 20
    },
    {
      "debug/policy_chosen_logits": -3.162797689437866,
      "debug/policy_chosen_logps": -191.775146484375,
      "debug/policy_rejected_logits": -3.2479846477508545,
      "debug/policy_rejected_logps": -217.1043243408203,
      "debug/reference_chosen_logps": -215.61062622070312,
      "debug/reference_rejected_logps": -209.42953491210938,
      "epoch": 0.25,
      "grad_norm": 7.114249349006281,
      "learning_rate": 1e-06,
      "logits/chosen": -3.162797689437866,
      "logits/rejected": -3.2479846477508545,
      "logps/chosen": -191.775146484375,
      "logps/rejected": -217.1043243408203,
      "loss": 0.4064,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.23835478723049164,
      "rewards/margins": 0.3151026964187622,
      "rewards/rejected": -0.07674790918827057,
      "step": 21
    },
    {
      "debug/policy_chosen_logits": -3.3014657497406006,
      "debug/policy_chosen_logps": -185.117919921875,
      "debug/policy_rejected_logits": -3.2064030170440674,
      "debug/policy_rejected_logps": -231.311279296875,
      "debug/reference_chosen_logps": -208.03945922851562,
      "debug/reference_rejected_logps": -204.46234130859375,
      "epoch": 0.2619047619047619,
      "grad_norm": 6.717051925734186,
      "learning_rate": 1e-06,
      "logits/chosen": -3.3014657497406006,
      "logits/rejected": -3.2064030170440674,
      "logps/chosen": -185.117919921875,
      "logps/rejected": -231.311279296875,
      "loss": 0.4124,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.22921523451805115,
      "rewards/margins": 0.4977045953273773,
      "rewards/rejected": -0.26848936080932617,
      "step": 22
    },
    {
      "debug/policy_chosen_logits": -3.2503294944763184,
      "debug/policy_chosen_logps": -193.4159698486328,
      "debug/policy_rejected_logits": -3.1396243572235107,
      "debug/policy_rejected_logps": -254.65135192871094,
      "debug/reference_chosen_logps": -217.694580078125,
      "debug/reference_rejected_logps": -212.81295776367188,
      "epoch": 0.27380952380952384,
      "grad_norm": 7.339797017549244,
      "learning_rate": 1e-06,
      "logits/chosen": -3.2503294944763184,
      "logits/rejected": -3.1396243572235107,
      "logps/chosen": -193.4159698486328,
      "logps/rejected": -254.65135192871094,
      "loss": 0.3197,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.24278610944747925,
      "rewards/margins": 0.6611701250076294,
      "rewards/rejected": -0.41838398575782776,
      "step": 23
    },
    {
      "debug/policy_chosen_logits": -3.299981117248535,
      "debug/policy_chosen_logps": -185.9925537109375,
      "debug/policy_rejected_logits": -3.0819525718688965,
      "debug/policy_rejected_logps": -258.153076171875,
      "debug/reference_chosen_logps": -212.69705200195312,
      "debug/reference_rejected_logps": -221.9197235107422,
      "epoch": 0.2857142857142857,
      "grad_norm": 6.4780070021425145,
      "learning_rate": 1e-06,
      "logits/chosen": -3.299981117248535,
      "logits/rejected": -3.0819525718688965,
      "logps/chosen": -185.9925537109375,
      "logps/rejected": -258.153076171875,
      "loss": 0.3351,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.2670450806617737,
      "rewards/margins": 0.629378616809845,
      "rewards/rejected": -0.3623335361480713,
      "step": 24
    },
    {
      "debug/policy_chosen_logits": -3.3384597301483154,
      "debug/policy_chosen_logps": -179.01553344726562,
      "debug/policy_rejected_logits": -3.2596333026885986,
      "debug/policy_rejected_logps": -230.0491180419922,
      "debug/reference_chosen_logps": -201.91806030273438,
      "debug/reference_rejected_logps": -212.782958984375,
      "epoch": 0.2976190476190476,
      "grad_norm": 5.646156232636996,
      "learning_rate": 1e-06,
      "logits/chosen": -3.3384597301483154,
      "logits/rejected": -3.2596333026885986,
      "logps/chosen": -179.01553344726562,
      "logps/rejected": -230.0491180419922,
      "loss": 0.3597,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.2290252447128296,
      "rewards/margins": 0.40168675780296326,
      "rewards/rejected": -0.17266148328781128,
      "step": 25
    },
    {
      "debug/policy_chosen_logits": -3.2951643466949463,
      "debug/policy_chosen_logps": -187.70423889160156,
      "debug/policy_rejected_logits": -3.064873218536377,
      "debug/policy_rejected_logps": -259.9974365234375,
      "debug/reference_chosen_logps": -209.17364501953125,
      "debug/reference_rejected_logps": -230.99514770507812,
      "epoch": 0.30952380952380953,
      "grad_norm": 5.004804914940331,
      "learning_rate": 1e-06,
      "logits/chosen": -3.2951643466949463,
      "logits/rejected": -3.064873218536377,
      "logps/chosen": -187.70423889160156,
      "logps/rejected": -259.9974365234375,
      "loss": 0.3607,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.21469400823116302,
      "rewards/margins": 0.5047171115875244,
      "rewards/rejected": -0.2900230884552002,
      "step": 26
    },
    {
      "debug/policy_chosen_logits": -3.205540895462036,
      "debug/policy_chosen_logps": -202.82135009765625,
      "debug/policy_rejected_logits": -3.1168112754821777,
      "debug/policy_rejected_logps": -220.36105346679688,
      "debug/reference_chosen_logps": -219.64862060546875,
      "debug/reference_rejected_logps": -217.84548950195312,
      "epoch": 0.32142857142857145,
      "grad_norm": 7.861811148122209,
      "learning_rate": 1e-06,
      "logits/chosen": -3.205540895462036,
      "logits/rejected": -3.1168112754821777,
      "logps/chosen": -202.82135009765625,
      "logps/rejected": -220.36105346679688,
      "loss": 0.3706,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.16827276349067688,
      "rewards/margins": 0.19342824816703796,
      "rewards/rejected": -0.02515549585223198,
      "step": 27
    },
    {
      "debug/policy_chosen_logits": -3.266860246658325,
      "debug/policy_chosen_logps": -190.13174438476562,
      "debug/policy_rejected_logits": -3.2241365909576416,
      "debug/policy_rejected_logps": -217.80401611328125,
      "debug/reference_chosen_logps": -207.9426727294922,
      "debug/reference_rejected_logps": -213.35520935058594,
      "epoch": 0.3333333333333333,
      "grad_norm": 5.17181036610514,
      "learning_rate": 1e-06,
      "logits/chosen": -3.266860246658325,
      "logits/rejected": -3.2241365909576416,
      "logps/chosen": -190.13174438476562,
      "logps/rejected": -217.80401611328125,
      "loss": 0.381,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.17810919880867004,
      "rewards/margins": 0.22259722650051117,
      "rewards/rejected": -0.044488027691841125,
      "step": 28
    },
    {
      "debug/policy_chosen_logits": -3.138282060623169,
      "debug/policy_chosen_logps": -194.2283935546875,
      "debug/policy_rejected_logits": -3.242144823074341,
      "debug/policy_rejected_logps": -240.74034118652344,
      "debug/reference_chosen_logps": -205.63931274414062,
      "debug/reference_rejected_logps": -223.89479064941406,
      "epoch": 0.34523809523809523,
      "grad_norm": 13.702732402282919,
      "learning_rate": 1e-06,
      "logits/chosen": -3.138282060623169,
      "logits/rejected": -3.242144823074341,
      "logps/chosen": -194.2283935546875,
      "logps/rejected": -240.74034118652344,
      "loss": 0.3319,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.11410927027463913,
      "rewards/margins": 0.2825648784637451,
      "rewards/rejected": -0.1684555858373642,
      "step": 29
    },
    {
      "debug/policy_chosen_logits": -3.2786245346069336,
      "debug/policy_chosen_logps": -196.5978546142578,
      "debug/policy_rejected_logits": -3.1386148929595947,
      "debug/policy_rejected_logps": -242.57998657226562,
      "debug/reference_chosen_logps": -202.89393615722656,
      "debug/reference_rejected_logps": -212.0622100830078,
      "epoch": 0.35714285714285715,
      "grad_norm": 18.8411217134156,
      "learning_rate": 1e-06,
      "logits/chosen": -3.2786245346069336,
      "logits/rejected": -3.1386148929595947,
      "logps/chosen": -196.5978546142578,
      "logps/rejected": -242.57998657226562,
      "loss": 0.3392,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.0629609078168869,
      "rewards/margins": 0.36813876032829285,
      "rewards/rejected": -0.30517783761024475,
      "step": 30
    },
    {
      "debug/policy_chosen_logits": -3.220453977584839,
      "debug/policy_chosen_logps": -210.29718017578125,
      "debug/policy_rejected_logits": -3.0406980514526367,
      "debug/policy_rejected_logps": -254.49639892578125,
      "debug/reference_chosen_logps": -217.5068359375,
      "debug/reference_rejected_logps": -222.9973907470703,
      "epoch": 0.36904761904761907,
      "grad_norm": 17.63556570482219,
      "learning_rate": 1e-06,
      "logits/chosen": -3.220453977584839,
      "logits/rejected": -3.0406980514526367,
      "logps/chosen": -210.29718017578125,
      "logps/rejected": -254.49639892578125,
      "loss": 0.3337,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.0720965787768364,
      "rewards/margins": 0.38708674907684326,
      "rewards/rejected": -0.31499019265174866,
      "step": 31
    },
    {
      "debug/policy_chosen_logits": -3.283477544784546,
      "debug/policy_chosen_logps": -195.7086944580078,
      "debug/policy_rejected_logits": -3.2998650074005127,
      "debug/policy_rejected_logps": -248.77716064453125,
      "debug/reference_chosen_logps": -205.69100952148438,
      "debug/reference_rejected_logps": -238.83279418945312,
      "epoch": 0.38095238095238093,
      "grad_norm": 3.545596888211082,
      "learning_rate": 1e-06,
      "logits/chosen": -3.283477544784546,
      "logits/rejected": -3.2998650074005127,
      "logps/chosen": -195.7086944580078,
      "logps/rejected": -248.77716064453125,
      "loss": 0.3367,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.09982330352067947,
      "rewards/margins": 0.19926708936691284,
      "rewards/rejected": -0.09944379329681396,
      "step": 32
    },
    {
      "debug/policy_chosen_logits": -3.2302284240722656,
      "debug/policy_chosen_logps": -189.14857482910156,
      "debug/policy_rejected_logits": -3.1394155025482178,
      "debug/policy_rejected_logps": -231.4539031982422,
      "debug/reference_chosen_logps": -204.17739868164062,
      "debug/reference_rejected_logps": -209.3110809326172,
      "epoch": 0.39285714285714285,
      "grad_norm": 3.835026591129013,
      "learning_rate": 1e-06,
      "logits/chosen": -3.2302284240722656,
      "logits/rejected": -3.1394155025482178,
      "logps/chosen": -189.14857482910156,
      "logps/rejected": -231.4539031982422,
      "loss": 0.337,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.15028832852840424,
      "rewards/margins": 0.3717164993286133,
      "rewards/rejected": -0.22142818570137024,
      "step": 33
    },
    {
      "debug/policy_chosen_logits": -3.2129104137420654,
      "debug/policy_chosen_logps": -199.6995849609375,
      "debug/policy_rejected_logits": -3.0475525856018066,
      "debug/policy_rejected_logps": -242.47573852539062,
      "debug/reference_chosen_logps": -211.38697814941406,
      "debug/reference_rejected_logps": -218.95986938476562,
      "epoch": 0.40476190476190477,
      "grad_norm": 3.6065017129358563,
      "learning_rate": 1e-06,
      "logits/chosen": -3.2129104137420654,
      "logits/rejected": -3.0475525856018066,
      "logps/chosen": -199.6995849609375,
      "logps/rejected": -242.47573852539062,
      "loss": 0.3252,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.11687390506267548,
      "rewards/margins": 0.35203248262405396,
      "rewards/rejected": -0.23515859246253967,
      "step": 34
    },
    {
      "debug/policy_chosen_logits": -3.0805389881134033,
      "debug/policy_chosen_logps": -200.73974609375,
      "debug/policy_rejected_logits": -3.04288649559021,
      "debug/policy_rejected_logps": -218.5482940673828,
      "debug/reference_chosen_logps": -213.4292449951172,
      "debug/reference_rejected_logps": -223.4329071044922,
      "epoch": 0.4166666666666667,
      "grad_norm": 5.765726016207518,
      "learning_rate": 1e-06,
      "logits/chosen": -3.0805389881134033,
      "logits/rejected": -3.04288649559021,
      "logps/chosen": -200.73974609375,
      "logps/rejected": -218.5482940673828,
      "loss": 0.3783,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.12689509987831116,
      "rewards/margins": 0.07804906368255615,
      "rewards/rejected": 0.048846036195755005,
      "step": 35
    },
    {
      "debug/policy_chosen_logits": -3.339388370513916,
      "debug/policy_chosen_logps": -175.0019073486328,
      "debug/policy_rejected_logits": -3.125450611114502,
      "debug/policy_rejected_logps": -234.52488708496094,
      "debug/reference_chosen_logps": -191.13693237304688,
      "debug/reference_rejected_logps": -211.32481384277344,
      "epoch": 0.42857142857142855,
      "grad_norm": 3.4118668773069563,
      "learning_rate": 1e-06,
      "logits/chosen": -3.339388370513916,
      "logits/rejected": -3.125450611114502,
      "logps/chosen": -175.0019073486328,
      "logps/rejected": -234.52488708496094,
      "loss": 0.324,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.16135042905807495,
      "rewards/margins": 0.3933510482311249,
      "rewards/rejected": -0.23200063407421112,
      "step": 36
    },
    {
      "debug/policy_chosen_logits": -3.1920435428619385,
      "debug/policy_chosen_logps": -200.70831298828125,
      "debug/policy_rejected_logits": -3.0675861835479736,
      "debug/policy_rejected_logps": -240.75643920898438,
      "debug/reference_chosen_logps": -220.32891845703125,
      "debug/reference_rejected_logps": -224.6131591796875,
      "epoch": 0.44047619047619047,
      "grad_norm": 3.775080381030047,
      "learning_rate": 1e-06,
      "logits/chosen": -3.1920435428619385,
      "logits/rejected": -3.0675861835479736,
      "logps/chosen": -200.70831298828125,
      "logps/rejected": -240.75643920898438,
      "loss": 0.3004,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.19620609283447266,
      "rewards/margins": 0.357638955116272,
      "rewards/rejected": -0.16143286228179932,
      "step": 37
    },
    {
      "debug/policy_chosen_logits": -3.173511505126953,
      "debug/policy_chosen_logps": -203.63047790527344,
      "debug/policy_rejected_logits": -3.0617382526397705,
      "debug/policy_rejected_logps": -252.37109375,
      "debug/reference_chosen_logps": -214.03150939941406,
      "debug/reference_rejected_logps": -230.29051208496094,
      "epoch": 0.4523809523809524,
      "grad_norm": 4.456940286801414,
      "learning_rate": 1e-06,
      "logits/chosen": -3.173511505126953,
      "logits/rejected": -3.0617382526397705,
      "logps/chosen": -203.63047790527344,
      "logps/rejected": -252.37109375,
      "loss": 0.3421,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.10401026904582977,
      "rewards/margins": 0.3248162269592285,
      "rewards/rejected": -0.22080595791339874,
      "step": 38
    },
    {
      "debug/policy_chosen_logits": -3.153968095779419,
      "debug/policy_chosen_logps": -215.73043823242188,
      "debug/policy_rejected_logits": -3.1119275093078613,
      "debug/policy_rejected_logps": -230.99270629882812,
      "debug/reference_chosen_logps": -224.58657836914062,
      "debug/reference_rejected_logps": -230.7893829345703,
      "epoch": 0.4642857142857143,
      "grad_norm": 3.7503647290224547,
      "learning_rate": 1e-06,
      "logits/chosen": -3.153968095779419,
      "logits/rejected": -3.1119275093078613,
      "logps/chosen": -215.73043823242188,
      "logps/rejected": -230.99270629882812,
      "loss": 0.3678,
      "rewards/accuracies": 0.625,
      "rewards/chosen": 0.08856132626533508,
      "rewards/margins": 0.09059463441371918,
      "rewards/rejected": -0.0020333081483840942,
      "step": 39
    },
    {
      "debug/policy_chosen_logits": -3.0754432678222656,
      "debug/policy_chosen_logps": -196.66781616210938,
      "debug/policy_rejected_logits": -2.9984235763549805,
      "debug/policy_rejected_logps": -238.45590209960938,
      "debug/reference_chosen_logps": -210.78643798828125,
      "debug/reference_rejected_logps": -208.17205810546875,
      "epoch": 0.47619047619047616,
      "grad_norm": 4.009273704463349,
      "learning_rate": 1e-06,
      "logits/chosen": -3.0754432678222656,
      "logits/rejected": -2.9984235763549805,
      "logps/chosen": -196.66781616210938,
      "logps/rejected": -238.45590209960938,
      "loss": 0.327,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.14118614792823792,
      "rewards/margins": 0.4440246522426605,
      "rewards/rejected": -0.3028385043144226,
      "step": 40
    },
    {
      "debug/policy_chosen_logits": -3.180971622467041,
      "debug/policy_chosen_logps": -211.52041625976562,
      "debug/policy_rejected_logits": -3.1294796466827393,
      "debug/policy_rejected_logps": -226.37899780273438,
      "debug/reference_chosen_logps": -221.50608825683594,
      "debug/reference_rejected_logps": -220.98367309570312,
      "epoch": 0.4880952380952381,
      "grad_norm": 4.554216689284069,
      "learning_rate": 1e-06,
      "logits/chosen": -3.180971622467041,
      "logits/rejected": -3.1294796466827393,
      "logps/chosen": -211.52041625976562,
      "logps/rejected": -226.37899780273438,
      "loss": 0.3403,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.09985677897930145,
      "rewards/margins": 0.15380997955799103,
      "rewards/rejected": -0.05395320802927017,
      "step": 41
    },
    {
      "debug/policy_chosen_logits": -3.1375789642333984,
      "debug/policy_chosen_logps": -213.60757446289062,
      "debug/policy_rejected_logits": -3.1117618083953857,
      "debug/policy_rejected_logps": -256.28436279296875,
      "debug/reference_chosen_logps": -226.36074829101562,
      "debug/reference_rejected_logps": -242.12509155273438,
      "epoch": 0.5,
      "grad_norm": 4.775957985916952,
      "learning_rate": 1e-06,
      "logits/chosen": -3.1375789642333984,
      "logits/rejected": -3.1117618083953857,
      "logps/chosen": -213.60757446289062,
      "logps/rejected": -256.28436279296875,
      "loss": 0.3186,
      "rewards/accuracies": 0.625,
      "rewards/chosen": 0.1275317668914795,
      "rewards/margins": 0.26912426948547363,
      "rewards/rejected": -0.14159251749515533,
      "step": 42
    },
    {
      "debug/policy_chosen_logits": -3.1112849712371826,
      "debug/policy_chosen_logps": -208.876220703125,
      "debug/policy_rejected_logits": -3.0731041431427,
      "debug/policy_rejected_logps": -213.07131958007812,
      "debug/reference_chosen_logps": -226.13137817382812,
      "debug/reference_rejected_logps": -202.86383056640625,
      "epoch": 0.5119047619047619,
      "grad_norm": 3.169314540293903,
      "learning_rate": 1e-06,
      "logits/chosen": -3.1112849712371826,
      "logits/rejected": -3.0731041431427,
      "logps/chosen": -208.876220703125,
      "logps/rejected": -213.07131958007812,
      "loss": 0.3156,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.17255139350891113,
      "rewards/margins": 0.2746262848377228,
      "rewards/rejected": -0.10207486897706985,
      "step": 43
    },
    {
      "debug/policy_chosen_logits": -3.1239426136016846,
      "debug/policy_chosen_logps": -201.20697021484375,
      "debug/policy_rejected_logits": -3.087494373321533,
      "debug/policy_rejected_logps": -259.4544372558594,
      "debug/reference_chosen_logps": -221.05288696289062,
      "debug/reference_rejected_logps": -240.31405639648438,
      "epoch": 0.5238095238095238,
      "grad_norm": 3.436483143809452,
      "learning_rate": 1e-06,
      "logits/chosen": -3.1239426136016846,
      "logits/rejected": -3.087494373321533,
      "logps/chosen": -201.20697021484375,
      "logps/rejected": -259.4544372558594,
      "loss": 0.3123,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.19845911860466003,
      "rewards/margins": 0.3898632526397705,
      "rewards/rejected": -0.19140410423278809,
      "step": 44
    },
    {
      "debug/policy_chosen_logits": -3.138341188430786,
      "debug/policy_chosen_logps": -175.82582092285156,
      "debug/policy_rejected_logits": -2.981558084487915,
      "debug/policy_rejected_logps": -231.8880615234375,
      "debug/reference_chosen_logps": -196.50595092773438,
      "debug/reference_rejected_logps": -216.26844787597656,
      "epoch": 0.5357142857142857,
      "grad_norm": 3.953231116742769,
      "learning_rate": 1e-06,
      "logits/chosen": -3.138341188430786,
      "logits/rejected": -2.981558084487915,
      "logps/chosen": -175.82582092285156,
      "logps/rejected": -231.8880615234375,
      "loss": 0.3425,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.2068011611700058,
      "rewards/margins": 0.3629972040653229,
      "rewards/rejected": -0.15619604289531708,
      "step": 45
    },
    {
      "debug/policy_chosen_logits": -3.097372531890869,
      "debug/policy_chosen_logps": -190.10743713378906,
      "debug/policy_rejected_logits": -2.9607269763946533,
      "debug/policy_rejected_logps": -255.23684692382812,
      "debug/reference_chosen_logps": -207.03024291992188,
      "debug/reference_rejected_logps": -224.69744873046875,
      "epoch": 0.5476190476190477,
      "grad_norm": 4.130780539545194,
      "learning_rate": 1e-06,
      "logits/chosen": -3.097372531890869,
      "logits/rejected": -2.9607269763946533,
      "logps/chosen": -190.10743713378906,
      "logps/rejected": -255.23684692382812,
      "loss": 0.334,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.16922807693481445,
      "rewards/margins": 0.4746219217777252,
      "rewards/rejected": -0.30539384484291077,
      "step": 46
    },
    {
      "debug/policy_chosen_logits": -3.1764347553253174,
      "debug/policy_chosen_logps": -204.2672119140625,
      "debug/policy_rejected_logits": -3.070920467376709,
      "debug/policy_rejected_logps": -236.72357177734375,
      "debug/reference_chosen_logps": -222.78765869140625,
      "debug/reference_rejected_logps": -211.966552734375,
      "epoch": 0.5595238095238095,
      "grad_norm": 2.9027881991083926,
      "learning_rate": 1e-06,
      "logits/chosen": -3.1764347553253174,
      "logits/rejected": -3.070920467376709,
      "logps/chosen": -204.2672119140625,
      "logps/rejected": -236.72357177734375,
      "loss": 0.3134,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.18520456552505493,
      "rewards/margins": 0.4327746033668518,
      "rewards/rejected": -0.24757003784179688,
      "step": 47
    },
    {
      "debug/policy_chosen_logits": -3.0339467525482178,
      "debug/policy_chosen_logps": -214.1638641357422,
      "debug/policy_rejected_logits": -3.061377763748169,
      "debug/policy_rejected_logps": -234.35415649414062,
      "debug/reference_chosen_logps": -219.6826171875,
      "debug/reference_rejected_logps": -225.13951110839844,
      "epoch": 0.5714285714285714,
      "grad_norm": 3.492138091854823,
      "learning_rate": 1e-06,
      "logits/chosen": -3.0339467525482178,
      "logits/rejected": -3.061377763748169,
      "logps/chosen": -214.1638641357422,
      "logps/rejected": -234.35415649414062,
      "loss": 0.3227,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.05518750846385956,
      "rewards/margins": 0.14733393490314484,
      "rewards/rejected": -0.09214641898870468,
      "step": 48
    },
    {
      "debug/policy_chosen_logits": -3.1581661701202393,
      "debug/policy_chosen_logps": -185.55007934570312,
      "debug/policy_rejected_logits": -3.0453264713287354,
      "debug/policy_rejected_logps": -262.7301025390625,
      "debug/reference_chosen_logps": -206.8365478515625,
      "debug/reference_rejected_logps": -242.17652893066406,
      "epoch": 0.5833333333333334,
      "grad_norm": 3.1731020410352615,
      "learning_rate": 1e-06,
      "logits/chosen": -3.1581661701202393,
      "logits/rejected": -3.0453264713287354,
      "logps/chosen": -185.55007934570312,
      "logps/rejected": -262.7301025390625,
      "loss": 0.3134,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.21286450326442719,
      "rewards/margins": 0.4184000492095947,
      "rewards/rejected": -0.20553553104400635,
      "step": 49
    },
    {
      "debug/policy_chosen_logits": -3.206784725189209,
      "debug/policy_chosen_logps": -170.53402709960938,
      "debug/policy_rejected_logits": -2.9109795093536377,
      "debug/policy_rejected_logps": -258.2296142578125,
      "debug/reference_chosen_logps": -196.68124389648438,
      "debug/reference_rejected_logps": -230.39051818847656,
      "epoch": 0.5952380952380952,
      "grad_norm": 2.934981229097639,
      "learning_rate": 1e-06,
      "logits/chosen": -3.206784725189209,
      "logits/rejected": -2.9109795093536377,
      "logps/chosen": -170.53402709960938,
      "logps/rejected": -258.2296142578125,
      "loss": 0.2659,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.26147204637527466,
      "rewards/margins": 0.5398629307746887,
      "rewards/rejected": -0.27839091420173645,
      "step": 50
    },
    {
      "debug/policy_chosen_logits": -3.0622358322143555,
      "debug/policy_chosen_logps": -200.63052368164062,
      "debug/policy_rejected_logits": -2.947862148284912,
      "debug/policy_rejected_logps": -252.6195068359375,
      "debug/reference_chosen_logps": -221.46884155273438,
      "debug/reference_rejected_logps": -228.0500030517578,
      "epoch": 0.6071428571428571,
      "grad_norm": 3.5442225737215387,
      "learning_rate": 1e-06,
      "logits/chosen": -3.0622358322143555,
      "logits/rejected": -2.947862148284912,
      "logps/chosen": -200.63052368164062,
      "logps/rejected": -252.6195068359375,
      "loss": 0.3124,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.20838311314582825,
      "rewards/margins": 0.4540780782699585,
      "rewards/rejected": -0.24569493532180786,
      "step": 51
    },
    {
      "debug/policy_chosen_logits": -3.1601505279541016,
      "debug/policy_chosen_logps": -182.13150024414062,
      "debug/policy_rejected_logits": -2.988621711730957,
      "debug/policy_rejected_logps": -228.06826782226562,
      "debug/reference_chosen_logps": -201.15792846679688,
      "debug/reference_rejected_logps": -213.58132934570312,
      "epoch": 0.6190476190476191,
      "grad_norm": 4.536944665819507,
      "learning_rate": 1e-06,
      "logits/chosen": -3.1601505279541016,
      "logits/rejected": -2.988621711730957,
      "logps/chosen": -182.13150024414062,
      "logps/rejected": -228.06826782226562,
      "loss": 0.3305,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.19026409089565277,
      "rewards/margins": 0.33513346314430237,
      "rewards/rejected": -0.1448693871498108,
      "step": 52
    },
    {
      "debug/policy_chosen_logits": -3.198798894882202,
      "debug/policy_chosen_logps": -189.36521911621094,
      "debug/policy_rejected_logits": -3.0948612689971924,
      "debug/policy_rejected_logps": -224.48573303222656,
      "debug/reference_chosen_logps": -206.97372436523438,
      "debug/reference_rejected_logps": -216.28659057617188,
      "epoch": 0.6309523809523809,
      "grad_norm": 3.1233076322027067,
      "learning_rate": 1e-06,
      "logits/chosen": -3.198798894882202,
      "logits/rejected": -3.0948612689971924,
      "logps/chosen": -189.36521911621094,
      "logps/rejected": -224.48573303222656,
      "loss": 0.2912,
      "rewards/accuracies": 0.625,
      "rewards/chosen": 0.17608505487442017,
      "rewards/margins": 0.25807660818099976,
      "rewards/rejected": -0.08199156075716019,
      "step": 53
    },
    {
      "debug/policy_chosen_logits": -3.0483901500701904,
      "debug/policy_chosen_logps": -220.47686767578125,
      "debug/policy_rejected_logits": -3.0346791744232178,
      "debug/policy_rejected_logps": -233.89801025390625,
      "debug/reference_chosen_logps": -234.3529052734375,
      "debug/reference_rejected_logps": -234.83953857421875,
      "epoch": 0.6428571428571429,
      "grad_norm": 4.209474032104615,
      "learning_rate": 1e-06,
      "logits/chosen": -3.0483901500701904,
      "logits/rejected": -3.0346791744232178,
      "logps/chosen": -220.47686767578125,
      "logps/rejected": -233.89801025390625,
      "loss": 0.3198,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.1387602984905243,
      "rewards/margins": 0.12934501469135284,
      "rewards/rejected": 0.00941528007388115,
      "step": 54
    },
    {
      "debug/policy_chosen_logits": -3.171710968017578,
      "debug/policy_chosen_logps": -177.91751098632812,
      "debug/policy_rejected_logits": -3.0502405166625977,
      "debug/policy_rejected_logps": -286.4560852050781,
      "debug/reference_chosen_logps": -200.41464233398438,
      "debug/reference_rejected_logps": -257.74066162109375,
      "epoch": 0.6547619047619048,
      "grad_norm": 4.118989628914695,
      "learning_rate": 1e-06,
      "logits/chosen": -3.171710968017578,
      "logits/rejected": -3.0502405166625977,
      "logps/chosen": -177.91751098632812,
      "logps/rejected": -286.4560852050781,
      "loss": 0.2885,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.22497142851352692,
      "rewards/margins": 0.5121252536773682,
      "rewards/rejected": -0.28715386986732483,
      "step": 55
    },
    {
      "debug/policy_chosen_logits": -3.09352445602417,
      "debug/policy_chosen_logps": -201.69338989257812,
      "debug/policy_rejected_logits": -2.858083486557007,
      "debug/policy_rejected_logps": -237.42831420898438,
      "debug/reference_chosen_logps": -221.5948944091797,
      "debug/reference_rejected_logps": -233.11854553222656,
      "epoch": 0.6666666666666666,
      "grad_norm": 5.4156894530313835,
      "learning_rate": 1e-06,
      "logits/chosen": -3.09352445602417,
      "logits/rejected": -2.858083486557007,
      "logps/chosen": -201.69338989257812,
      "logps/rejected": -237.42831420898438,
      "loss": 0.3487,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.19901515543460846,
      "rewards/margins": 0.24211297929286957,
      "rewards/rejected": -0.04309781268239021,
      "step": 56
    },
    {
      "debug/policy_chosen_logits": -3.0208303928375244,
      "debug/policy_chosen_logps": -204.16375732421875,
      "debug/policy_rejected_logits": -2.9121055603027344,
      "debug/policy_rejected_logps": -266.00836181640625,
      "debug/reference_chosen_logps": -221.49114990234375,
      "debug/reference_rejected_logps": -238.82809448242188,
      "epoch": 0.6785714285714286,
      "grad_norm": 3.4258355626960837,
      "learning_rate": 1e-06,
      "logits/chosen": -3.0208303928375244,
      "logits/rejected": -2.9121055603027344,
      "logps/chosen": -204.16375732421875,
      "logps/rejected": -266.00836181640625,
      "loss": 0.3158,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.17327386140823364,
      "rewards/margins": 0.44507646560668945,
      "rewards/rejected": -0.2718026041984558,
      "step": 57
    },
    {
      "debug/policy_chosen_logits": -3.2102866172790527,
      "debug/policy_chosen_logps": -218.61669921875,
      "debug/policy_rejected_logits": -3.0547966957092285,
      "debug/policy_rejected_logps": -265.02374267578125,
      "debug/reference_chosen_logps": -230.63229370117188,
      "debug/reference_rejected_logps": -222.15362548828125,
      "epoch": 0.6904761904761905,
      "grad_norm": 6.73796058848574,
      "learning_rate": 1e-06,
      "logits/chosen": -3.2102866172790527,
      "logits/rejected": -3.0547966957092285,
      "logps/chosen": -218.61669921875,
      "logps/rejected": -265.02374267578125,
      "loss": 0.2995,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.12015601992607117,
      "rewards/margins": 0.5488572120666504,
      "rewards/rejected": -0.42870116233825684,
      "step": 58
    },
    {
      "debug/policy_chosen_logits": -3.1211562156677246,
      "debug/policy_chosen_logps": -186.32528686523438,
      "debug/policy_rejected_logits": -3.046443462371826,
      "debug/policy_rejected_logps": -230.16293334960938,
      "debug/reference_chosen_logps": -207.93145751953125,
      "debug/reference_rejected_logps": -210.10281372070312,
      "epoch": 0.7023809523809523,
      "grad_norm": 3.5427381092079506,
      "learning_rate": 1e-06,
      "logits/chosen": -3.1211562156677246,
      "logits/rejected": -3.046443462371826,
      "logps/chosen": -186.32528686523438,
      "logps/rejected": -230.16293334960938,
      "loss": 0.3172,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.21606168150901794,
      "rewards/margins": 0.4166628122329712,
      "rewards/rejected": -0.20060113072395325,
      "step": 59
    },
    {
      "debug/policy_chosen_logits": -3.099186420440674,
      "debug/policy_chosen_logps": -179.74508666992188,
      "debug/policy_rejected_logits": -3.098525047302246,
      "debug/policy_rejected_logps": -218.37451171875,
      "debug/reference_chosen_logps": -204.20477294921875,
      "debug/reference_rejected_logps": -208.37628173828125,
      "epoch": 0.7142857142857143,
      "grad_norm": 3.822141253561726,
      "learning_rate": 1e-06,
      "logits/chosen": -3.099186420440674,
      "logits/rejected": -3.098525047302246,
      "logps/chosen": -179.74508666992188,
      "logps/rejected": -218.37451171875,
      "loss": 0.2797,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.2445967197418213,
      "rewards/margins": 0.3445791006088257,
      "rewards/rejected": -0.0999823734164238,
      "step": 60
    },
    {
      "debug/policy_chosen_logits": -3.1759068965911865,
      "debug/policy_chosen_logps": -189.11534118652344,
      "debug/policy_rejected_logits": -3.130244255065918,
      "debug/policy_rejected_logps": -233.64544677734375,
      "debug/reference_chosen_logps": -214.55841064453125,
      "debug/reference_rejected_logps": -213.53517150878906,
      "epoch": 0.7261904761904762,
      "grad_norm": 3.7708296837344957,
      "learning_rate": 1e-06,
      "logits/chosen": -3.1759068965911865,
      "logits/rejected": -3.130244255065918,
      "logps/chosen": -189.11534118652344,
      "logps/rejected": -233.64544677734375,
      "loss": 0.2868,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.25443071126937866,
      "rewards/margins": 0.45553332567214966,
      "rewards/rejected": -0.201102614402771,
      "step": 61
    },
    {
      "debug/policy_chosen_logits": -3.1119725704193115,
      "debug/policy_chosen_logps": -178.9134063720703,
      "debug/policy_rejected_logits": -3.0561046600341797,
      "debug/policy_rejected_logps": -235.5465545654297,
      "debug/reference_chosen_logps": -201.8980712890625,
      "debug/reference_rejected_logps": -204.71258544921875,
      "epoch": 0.7380952380952381,
      "grad_norm": 3.6489245962660175,
      "learning_rate": 1e-06,
      "logits/chosen": -3.1119725704193115,
      "logits/rejected": -3.0561046600341797,
      "logps/chosen": -178.9134063720703,
      "logps/rejected": -235.5465545654297,
      "loss": 0.2636,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.22984656691551208,
      "rewards/margins": 0.5381861925125122,
      "rewards/rejected": -0.3083396553993225,
      "step": 62
    },
    {
      "debug/policy_chosen_logits": -3.1469156742095947,
      "debug/policy_chosen_logps": -201.1051025390625,
      "debug/policy_rejected_logits": -3.169464111328125,
      "debug/policy_rejected_logps": -232.58746337890625,
      "debug/reference_chosen_logps": -221.8560028076172,
      "debug/reference_rejected_logps": -219.1409149169922,
      "epoch": 0.75,
      "grad_norm": 4.929161871830666,
      "learning_rate": 1e-06,
      "logits/chosen": -3.1469156742095947,
      "logits/rejected": -3.169464111328125,
      "logps/chosen": -201.1051025390625,
      "logps/rejected": -232.58746337890625,
      "loss": 0.2553,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.20750907063484192,
      "rewards/margins": 0.34197473526000977,
      "rewards/rejected": -0.13446564972400665,
      "step": 63
    },
    {
      "debug/policy_chosen_logits": -3.264002799987793,
      "debug/policy_chosen_logps": -176.3603515625,
      "debug/policy_rejected_logits": -3.2041733264923096,
      "debug/policy_rejected_logps": -237.74607849121094,
      "debug/reference_chosen_logps": -194.56317138671875,
      "debug/reference_rejected_logps": -220.99493408203125,
      "epoch": 0.7619047619047619,
      "grad_norm": 3.818382564446503,
      "learning_rate": 1e-06,
      "logits/chosen": -3.264002799987793,
      "logits/rejected": -3.2041733264923096,
      "logps/chosen": -176.3603515625,
      "logps/rejected": -237.74607849121094,
      "loss": 0.2587,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.18202824890613556,
      "rewards/margins": 0.3495399057865143,
      "rewards/rejected": -0.16751165688037872,
      "step": 64
    },
    {
      "debug/policy_chosen_logits": -3.119412899017334,
      "debug/policy_chosen_logps": -203.42031860351562,
      "debug/policy_rejected_logits": -3.1157796382904053,
      "debug/policy_rejected_logps": -233.96710205078125,
      "debug/reference_chosen_logps": -222.92288208007812,
      "debug/reference_rejected_logps": -212.2803955078125,
      "epoch": 0.7738095238095238,
      "grad_norm": 3.2445663415245227,
      "learning_rate": 1e-06,
      "logits/chosen": -3.119412899017334,
      "logits/rejected": -3.1157796382904053,
      "logps/chosen": -203.42031860351562,
      "logps/rejected": -233.96710205078125,
      "loss": 0.2657,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.19502560794353485,
      "rewards/margins": 0.41189298033714294,
      "rewards/rejected": -0.2168673723936081,
      "step": 65
    },
    {
      "debug/policy_chosen_logits": -3.1231942176818848,
      "debug/policy_chosen_logps": -187.53953552246094,
      "debug/policy_rejected_logits": -2.986668109893799,
      "debug/policy_rejected_logps": -240.6594696044922,
      "debug/reference_chosen_logps": -203.42880249023438,
      "debug/reference_rejected_logps": -214.08140563964844,
      "epoch": 0.7857142857142857,
      "grad_norm": 3.993146507021109,
      "learning_rate": 1e-06,
      "logits/chosen": -3.1231942176818848,
      "logits/rejected": -2.986668109893799,
      "logps/chosen": -187.53953552246094,
      "logps/rejected": -240.6594696044922,
      "loss": 0.2497,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.15889252722263336,
      "rewards/margins": 0.4246731102466583,
      "rewards/rejected": -0.26578059792518616,
      "step": 66
    },
    {
      "debug/policy_chosen_logits": -3.1682167053222656,
      "debug/policy_chosen_logps": -205.82066345214844,
      "debug/policy_rejected_logits": -3.2192001342773438,
      "debug/policy_rejected_logps": -218.02267456054688,
      "debug/reference_chosen_logps": -218.94798278808594,
      "debug/reference_rejected_logps": -207.19180297851562,
      "epoch": 0.7976190476190477,
      "grad_norm": 4.631060327525655,
      "learning_rate": 1e-06,
      "logits/chosen": -3.1682167053222656,
      "logits/rejected": -3.2192001342773438,
      "logps/chosen": -205.82066345214844,
      "logps/rejected": -218.02267456054688,
      "loss": 0.256,
      "rewards/accuracies": 0.625,
      "rewards/chosen": 0.1312730759382248,
      "rewards/margins": 0.23958177864551544,
      "rewards/rejected": -0.10830870270729065,
      "step": 67
    },
    {
      "debug/policy_chosen_logits": -3.203059196472168,
      "debug/policy_chosen_logps": -178.87045288085938,
      "debug/policy_rejected_logits": -2.987100839614868,
      "debug/policy_rejected_logps": -232.19949340820312,
      "debug/reference_chosen_logps": -203.59388732910156,
      "debug/reference_rejected_logps": -210.4298553466797,
      "epoch": 0.8095238095238095,
      "grad_norm": 3.521359819189684,
      "learning_rate": 1e-06,
      "logits/chosen": -3.203059196472168,
      "logits/rejected": -2.987100839614868,
      "logps/chosen": -178.87045288085938,
      "logps/rejected": -232.19949340820312,
      "loss": 0.3033,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.24723434448242188,
      "rewards/margins": 0.4649306535720825,
      "rewards/rejected": -0.21769630908966064,
      "step": 68
    },
    {
      "debug/policy_chosen_logits": -3.0755350589752197,
      "debug/policy_chosen_logps": -187.50741577148438,
      "debug/policy_rejected_logits": -3.0124213695526123,
      "debug/policy_rejected_logps": -249.99533081054688,
      "debug/reference_chosen_logps": -211.29078674316406,
      "debug/reference_rejected_logps": -215.66058349609375,
      "epoch": 0.8214285714285714,
      "grad_norm": 4.192433976871342,
      "learning_rate": 1e-06,
      "logits/chosen": -3.0755350589752197,
      "logits/rejected": -3.0124213695526123,
      "logps/chosen": -187.50741577148438,
      "logps/rejected": -249.99533081054688,
      "loss": 0.2695,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.23783370852470398,
      "rewards/margins": 0.5811812281608582,
      "rewards/rejected": -0.3433475196361542,
      "step": 69
    },
    {
      "debug/policy_chosen_logits": -3.1850805282592773,
      "debug/policy_chosen_logps": -188.49639892578125,
      "debug/policy_rejected_logits": -3.084040880203247,
      "debug/policy_rejected_logps": -220.804931640625,
      "debug/reference_chosen_logps": -212.1209716796875,
      "debug/reference_rejected_logps": -211.52203369140625,
      "epoch": 0.8333333333333334,
      "grad_norm": 3.2739721314248973,
      "learning_rate": 1e-06,
      "logits/chosen": -3.1850805282592773,
      "logits/rejected": -3.084040880203247,
      "logps/chosen": -188.49639892578125,
      "logps/rejected": -220.804931640625,
      "loss": 0.267,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.23624567687511444,
      "rewards/margins": 0.32907477021217346,
      "rewards/rejected": -0.09282909333705902,
      "step": 70
    },
    {
      "debug/policy_chosen_logits": -3.096564292907715,
      "debug/policy_chosen_logps": -207.81924438476562,
      "debug/policy_rejected_logits": -3.0242645740509033,
      "debug/policy_rejected_logps": -229.8334503173828,
      "debug/reference_chosen_logps": -227.1575164794922,
      "debug/reference_rejected_logps": -214.90184020996094,
      "epoch": 0.8452380952380952,
      "grad_norm": 3.0819588522165686,
      "learning_rate": 1e-06,
      "logits/chosen": -3.096564292907715,
      "logits/rejected": -3.0242645740509033,
      "logps/chosen": -207.81924438476562,
      "logps/rejected": -229.8334503173828,
      "loss": 0.2744,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.1933828443288803,
      "rewards/margins": 0.34269896149635315,
      "rewards/rejected": -0.14931611716747284,
      "step": 71
    },
    {
      "debug/policy_chosen_logits": -3.1674182415008545,
      "debug/policy_chosen_logps": -192.69192504882812,
      "debug/policy_rejected_logits": -3.0628364086151123,
      "debug/policy_rejected_logps": -225.1174774169922,
      "debug/reference_chosen_logps": -215.3536376953125,
      "debug/reference_rejected_logps": -209.8455352783203,
      "epoch": 0.8571428571428571,
      "grad_norm": 3.773640551428593,
      "learning_rate": 1e-06,
      "logits/chosen": -3.1674182415008545,
      "logits/rejected": -3.0628364086151123,
      "logps/chosen": -192.69192504882812,
      "logps/rejected": -225.1174774169922,
      "loss": 0.2596,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.2266169786453247,
      "rewards/margins": 0.37933632731437683,
      "rewards/rejected": -0.15271936357021332,
      "step": 72
    },
    {
      "debug/policy_chosen_logits": -3.077094793319702,
      "debug/policy_chosen_logps": -187.55560302734375,
      "debug/policy_rejected_logits": -3.0116915702819824,
      "debug/policy_rejected_logps": -248.090087890625,
      "debug/reference_chosen_logps": -208.9130096435547,
      "debug/reference_rejected_logps": -221.7311248779297,
      "epoch": 0.8690476190476191,
      "grad_norm": 3.2776627025025333,
      "learning_rate": 1e-06,
      "logits/chosen": -3.077094793319702,
      "logits/rejected": -3.0116915702819824,
      "logps/chosen": -187.55560302734375,
      "logps/rejected": -248.090087890625,
      "loss": 0.2596,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.21357415616512299,
      "rewards/margins": 0.47716373205184937,
      "rewards/rejected": -0.2635895609855652,
      "step": 73
    },
    {
      "debug/policy_chosen_logits": -3.182755470275879,
      "debug/policy_chosen_logps": -205.19509887695312,
      "debug/policy_rejected_logits": -2.9780564308166504,
      "debug/policy_rejected_logps": -253.59747314453125,
      "debug/reference_chosen_logps": -220.27328491210938,
      "debug/reference_rejected_logps": -215.6810302734375,
      "epoch": 0.8809523809523809,
      "grad_norm": 3.5105144029616446,
      "learning_rate": 1e-06,
      "logits/chosen": -3.182755470275879,
      "logits/rejected": -2.9780564308166504,
      "logps/chosen": -205.19509887695312,
      "logps/rejected": -253.59747314453125,
      "loss": 0.3096,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.1507818102836609,
      "rewards/margins": 0.5299463272094727,
      "rewards/rejected": -0.37916454672813416,
      "step": 74
    },
    {
      "debug/policy_chosen_logits": -3.0349855422973633,
      "debug/policy_chosen_logps": -197.20550537109375,
      "debug/policy_rejected_logits": -2.968355178833008,
      "debug/policy_rejected_logps": -260.9197998046875,
      "debug/reference_chosen_logps": -217.93182373046875,
      "debug/reference_rejected_logps": -250.27139282226562,
      "epoch": 0.8928571428571429,
      "grad_norm": 5.618897079839302,
      "learning_rate": 1e-06,
      "logits/chosen": -3.0349855422973633,
      "logits/rejected": -2.968355178833008,
      "logps/chosen": -197.20550537109375,
      "logps/rejected": -260.9197998046875,
      "loss": 0.3018,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.20726338028907776,
      "rewards/margins": 0.3137475848197937,
      "rewards/rejected": -0.10648422688245773,
      "step": 75
    },
    {
      "debug/policy_chosen_logits": -3.1911251544952393,
      "debug/policy_chosen_logps": -184.67440795898438,
      "debug/policy_rejected_logits": -3.182987928390503,
      "debug/policy_rejected_logps": -210.40711975097656,
      "debug/reference_chosen_logps": -203.4073486328125,
      "debug/reference_rejected_logps": -204.89013671875,
      "epoch": 0.9047619047619048,
      "grad_norm": 3.696876592657335,
      "learning_rate": 1e-06,
      "logits/chosen": -3.1911251544952393,
      "logits/rejected": -3.182987928390503,
      "logps/chosen": -184.67440795898438,
      "logps/rejected": -210.40711975097656,
      "loss": 0.3278,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.1873292624950409,
      "rewards/margins": 0.24249891936779022,
      "rewards/rejected": -0.05516962707042694,
      "step": 76
    },
    {
      "debug/policy_chosen_logits": -3.178257465362549,
      "debug/policy_chosen_logps": -187.74302673339844,
      "debug/policy_rejected_logits": -3.0253536701202393,
      "debug/policy_rejected_logps": -255.27618408203125,
      "debug/reference_chosen_logps": -204.26242065429688,
      "debug/reference_rejected_logps": -219.57601928710938,
      "epoch": 0.9166666666666666,
      "grad_norm": 6.676177800896766,
      "learning_rate": 1e-06,
      "logits/chosen": -3.178257465362549,
      "logits/rejected": -3.0253536701202393,
      "logps/chosen": -187.74302673339844,
      "logps/rejected": -255.27618408203125,
      "loss": 0.2808,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.16519393026828766,
      "rewards/margins": 0.5221953988075256,
      "rewards/rejected": -0.35700148344039917,
      "step": 77
    },
    {
      "debug/policy_chosen_logits": -3.172184705734253,
      "debug/policy_chosen_logps": -197.2301788330078,
      "debug/policy_rejected_logits": -3.122185230255127,
      "debug/policy_rejected_logps": -227.55764770507812,
      "debug/reference_chosen_logps": -208.9789276123047,
      "debug/reference_rejected_logps": -220.4404296875,
      "epoch": 0.9285714285714286,
      "grad_norm": 3.121642412627353,
      "learning_rate": 1e-06,
      "logits/chosen": -3.172184705734253,
      "logits/rejected": -3.122185230255127,
      "logps/chosen": -197.2301788330078,
      "logps/rejected": -227.55764770507812,
      "loss": 0.2861,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.11748749017715454,
      "rewards/margins": 0.18865975737571716,
      "rewards/rejected": -0.07117227464914322,
      "step": 78
    },
    {
      "debug/policy_chosen_logits": -3.149040937423706,
      "debug/policy_chosen_logps": -180.52720642089844,
      "debug/policy_rejected_logits": -3.1451313495635986,
      "debug/policy_rejected_logps": -242.54115295410156,
      "debug/reference_chosen_logps": -205.92019653320312,
      "debug/reference_rejected_logps": -219.41590881347656,
      "epoch": 0.9404761904761905,
      "grad_norm": 3.1134708691492845,
      "learning_rate": 1e-06,
      "logits/chosen": -3.149040937423706,
      "logits/rejected": -3.1451313495635986,
      "logps/chosen": -180.52720642089844,
      "logps/rejected": -242.54115295410156,
      "loss": 0.2408,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.25392991304397583,
      "rewards/margins": 0.4851823151111603,
      "rewards/rejected": -0.23125241696834564,
      "step": 79
    },
    {
      "debug/policy_chosen_logits": -3.215009927749634,
      "debug/policy_chosen_logps": -161.9908447265625,
      "debug/policy_rejected_logits": -2.9049410820007324,
      "debug/policy_rejected_logps": -237.18609619140625,
      "debug/reference_chosen_logps": -198.1100311279297,
      "debug/reference_rejected_logps": -205.93499755859375,
      "epoch": 0.9523809523809523,
      "grad_norm": 6.27678156865478,
      "learning_rate": 1e-06,
      "logits/chosen": -3.215009927749634,
      "logits/rejected": -2.9049410820007324,
      "logps/chosen": -161.9908447265625,
      "logps/rejected": -237.18609619140625,
      "loss": 0.3087,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.361191987991333,
      "rewards/margins": 0.6737030744552612,
      "rewards/rejected": -0.3125110864639282,
      "step": 80
    },
    {
      "debug/policy_chosen_logits": -3.0824358463287354,
      "debug/policy_chosen_logps": -185.10894775390625,
      "debug/policy_rejected_logits": -2.936793804168701,
      "debug/policy_rejected_logps": -268.28875732421875,
      "debug/reference_chosen_logps": -200.82766723632812,
      "debug/reference_rejected_logps": -233.51441955566406,
      "epoch": 0.9642857142857143,
      "grad_norm": 3.3522970674177937,
      "learning_rate": 1e-06,
      "logits/chosen": -3.0824358463287354,
      "logits/rejected": -2.936793804168701,
      "logps/chosen": -185.10894775390625,
      "logps/rejected": -268.28875732421875,
      "loss": 0.2521,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.15718725323677063,
      "rewards/margins": 0.5049305558204651,
      "rewards/rejected": -0.34774333238601685,
      "step": 81
    },
    {
      "debug/policy_chosen_logits": -3.135251760482788,
      "debug/policy_chosen_logps": -170.79705810546875,
      "debug/policy_rejected_logits": -3.0056710243225098,
      "debug/policy_rejected_logps": -227.4779052734375,
      "debug/reference_chosen_logps": -200.08804321289062,
      "debug/reference_rejected_logps": -197.99270629882812,
      "epoch": 0.9761904761904762,
      "grad_norm": 3.284253852501879,
      "learning_rate": 1e-06,
      "logits/chosen": -3.135251760482788,
      "logits/rejected": -3.0056710243225098,
      "logps/chosen": -170.79705810546875,
      "logps/rejected": -227.4779052734375,
      "loss": 0.2355,
      "rewards/accuracies": 0.875,
      "rewards/chosen": 0.2929098308086395,
      "rewards/margins": 0.5877617597579956,
      "rewards/rejected": -0.2948519289493561,
      "step": 82
    },
    {
      "debug/policy_chosen_logits": -3.0462417602539062,
      "debug/policy_chosen_logps": -195.46688842773438,
      "debug/policy_rejected_logits": -2.9659104347229004,
      "debug/policy_rejected_logps": -253.25881958007812,
      "debug/reference_chosen_logps": -216.95620727539062,
      "debug/reference_rejected_logps": -213.9208984375,
      "epoch": 0.9880952380952381,
      "grad_norm": 3.3816029123776024,
      "learning_rate": 1e-06,
      "logits/chosen": -3.0462417602539062,
      "logits/rejected": -2.9659104347229004,
      "logps/chosen": -195.46688842773438,
      "logps/rejected": -253.25881958007812,
      "loss": 0.2725,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.21489323675632477,
      "rewards/margins": 0.6082723140716553,
      "rewards/rejected": -0.3933790922164917,
      "step": 83
    },
    {
      "debug/policy_chosen_logits": -3.181661605834961,
      "debug/policy_chosen_logps": -201.0180206298828,
      "debug/policy_rejected_logits": -3.114567518234253,
      "debug/policy_rejected_logps": -248.83258056640625,
      "debug/reference_chosen_logps": -217.26596069335938,
      "debug/reference_rejected_logps": -230.24380493164062,
      "epoch": 1.0,
      "grad_norm": 3.6750878689595683,
      "learning_rate": 1e-06,
      "logits/chosen": -3.181661605834961,
      "logits/rejected": -3.114567518234253,
      "logps/chosen": -201.0180206298828,
      "logps/rejected": -248.83258056640625,
      "loss": 0.2642,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.16247960925102234,
      "rewards/margins": 0.3483673632144928,
      "rewards/rejected": -0.18588775396347046,
      "step": 84
    },
    {
      "epoch": 1.0,
      "step": 84,
      "total_flos": 0.0,
      "train_loss": 0.33750108753641445,
      "train_runtime": 246.5194,
      "train_samples_per_second": 21.613,
      "train_steps_per_second": 0.341
    }
  ],
  "logging_steps": 1,
  "max_steps": 84,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}