souging commited on
Commit
8f11400
·
verified ·
1 Parent(s): aa1554d

Training in progress, step 300, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:80511fa69b78427062f5dacff78510fd85d8c97429269198dd63544d1b02c1b7
3
  size 341141032
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6fd325c9c8f4d7dde06717edc7511070769418008c28d00119c090d6f0b7f68
3
  size 341141032
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bc232cd33d5b2c20be632ba576eee501a40579768b1a0340d9f4445d5260f899
3
  size 682673458
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1348db24f3e478f81922e312fe908cb739c4ee9e150ad2f036ab69be712443a9
3
  size 682673458
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:71822174d96d135fc0b9f223212f55b7edf1ef1a0a0d835e55d01eac85ab2f72
3
  size 16048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8829ed70b2030302656436ad3ce9c06239dc272781dfeb2182af49dba9382ced
3
  size 16048
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:be4680e97d3a9e0871eaabeb03847c28abda43c3c1bd154fe6b5422614b6d297
3
  size 16048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07ca875b62a1014d462f3dfbedd9672aacac54f278aad3dbfa56502e6d8980ab
3
  size 16048
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eba37faedf75f6fd1d456545a4ba0c5ac2d9af2a1f6fa4a76be4be9f08e32332
3
  size 16048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c68c3fb6c675fb82352432cbb00106f3cc3d144b27bc31460fa7671329b310e
3
  size 16048
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:87b25e0e7c17435010f4f72ba988546bebe35fcdf164a465b8831a4b00564ec1
3
  size 16048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:995034fad8cd98fd8cc7b08e9388f5c2c3ffc6f5ffc8b38220e45ff9f33fe319
3
  size 16048
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f5d33031e10ce21e8b98807574aa705a2f51427de06fae4f92633256df4abfa9
3
  size 16048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ff4478e39984dc2091dfafc378a1bae8f663f5ae6eb146d15efe3a27b8e0984
3
  size 16048
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b2f5232d002427e6f13c513ada2abcd0b67df496ceb133569ccef6be1add8249
3
  size 16048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac6cbcc62b40b33b44f268770d73327c7327607c19cfdd79854e8fb00ff2459d
3
  size 16048
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:78ec8bd95ebe73d244465d6aea42b4a22c575bb5c107734ebbe57bebfcfa5e1e
3
  size 16048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd4125a97cf9417ec9195cefebfe2739888b6110a9ec0efda3deecf9d22e99e1
3
  size 16048
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:79725f1e86991cb73c3f4c1e26c2ad9229e68bb920f9fe25cffd4abbb1ba1468
3
  size 16048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f72bd0d8d41bcc931626834be67ca7ce72d528b57449f01d7b596f92453186a
3
  size 16048
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:92f7d6c7692f3a66ef5b01cc1469ac8ce65f6b72724476d8637bb6fd7a582e68
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa3cb9fdd7c70b3d8d2084f26b1ba813283d06e83c4489733dc372cf0bfcf36f
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": 200,
3
  "best_metric": 0.09608737379312515,
4
  "best_model_checkpoint": "miner_id_24/checkpoint-200",
5
- "epoch": 4.2513089005235605,
6
  "eval_steps": 100,
7
- "global_step": 200,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1432,6 +1432,714 @@
1432
  "eval_samples_per_second": 42.213,
1433
  "eval_steps_per_second": 0.331,
1434
  "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1435
  }
1436
  ],
1437
  "logging_steps": 1,
@@ -1446,7 +2154,7 @@
1446
  "early_stopping_threshold": 0.0
1447
  },
1448
  "attributes": {
1449
- "early_stopping_patience_counter": 0
1450
  }
1451
  },
1452
  "TrainerControl": {
@@ -1455,12 +2163,12 @@
1455
  "should_evaluate": false,
1456
  "should_log": false,
1457
  "should_save": true,
1458
- "should_training_stop": false
1459
  },
1460
  "attributes": {}
1461
  }
1462
  },
1463
- "total_flos": 5.381203205633868e+18,
1464
  "train_batch_size": 3,
1465
  "trial_name": null,
1466
  "trial_params": null
 
2
  "best_global_step": 200,
3
  "best_metric": 0.09608737379312515,
4
  "best_model_checkpoint": "miner_id_24/checkpoint-200",
5
+ "epoch": 6.37696335078534,
6
  "eval_steps": 100,
7
+ "global_step": 300,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1432
  "eval_samples_per_second": 42.213,
1433
  "eval_steps_per_second": 0.331,
1434
  "step": 200
1435
+ },
1436
+ {
1437
+ "epoch": 4.272251308900524,
1438
+ "grad_norm": 0.10457887500524521,
1439
+ "learning_rate": 0.0001654277183310921,
1440
+ "loss": 0.082,
1441
+ "step": 201
1442
+ },
1443
+ {
1444
+ "epoch": 4.293193717277487,
1445
+ "grad_norm": 0.10059670358896255,
1446
+ "learning_rate": 0.00016478324743627101,
1447
+ "loss": 0.0858,
1448
+ "step": 202
1449
+ },
1450
+ {
1451
+ "epoch": 4.31413612565445,
1452
+ "grad_norm": 0.11438459903001785,
1453
+ "learning_rate": 0.00016413410611644825,
1454
+ "loss": 0.084,
1455
+ "step": 203
1456
+ },
1457
+ {
1458
+ "epoch": 4.335078534031414,
1459
+ "grad_norm": 0.06561236828565598,
1460
+ "learning_rate": 0.00016348034117023258,
1461
+ "loss": 0.0822,
1462
+ "step": 204
1463
+ },
1464
+ {
1465
+ "epoch": 4.356020942408377,
1466
+ "grad_norm": 0.11459755152463913,
1467
+ "learning_rate": 0.00016282199972956425,
1468
+ "loss": 0.0826,
1469
+ "step": 205
1470
+ },
1471
+ {
1472
+ "epoch": 4.37696335078534,
1473
+ "grad_norm": 0.07275859266519547,
1474
+ "learning_rate": 0.00016215912925631723,
1475
+ "loss": 0.081,
1476
+ "step": 206
1477
+ },
1478
+ {
1479
+ "epoch": 4.397905759162303,
1480
+ "grad_norm": 0.10688365250825882,
1481
+ "learning_rate": 0.00016149177753887746,
1482
+ "loss": 0.0804,
1483
+ "step": 207
1484
+ },
1485
+ {
1486
+ "epoch": 4.418848167539267,
1487
+ "grad_norm": 0.07950358837842941,
1488
+ "learning_rate": 0.00016081999268869766,
1489
+ "loss": 0.0817,
1490
+ "step": 208
1491
+ },
1492
+ {
1493
+ "epoch": 4.439790575916231,
1494
+ "grad_norm": 0.09884137660264969,
1495
+ "learning_rate": 0.00016014382313682881,
1496
+ "loss": 0.0818,
1497
+ "step": 209
1498
+ },
1499
+ {
1500
+ "epoch": 4.460732984293194,
1501
+ "grad_norm": 0.09773270040750504,
1502
+ "learning_rate": 0.00015946331763042867,
1503
+ "loss": 0.0841,
1504
+ "step": 210
1505
+ },
1506
+ {
1507
+ "epoch": 4.481675392670157,
1508
+ "grad_norm": 0.07958532869815826,
1509
+ "learning_rate": 0.00015877852522924732,
1510
+ "loss": 0.081,
1511
+ "step": 211
1512
+ },
1513
+ {
1514
+ "epoch": 4.50261780104712,
1515
+ "grad_norm": 0.10386484861373901,
1516
+ "learning_rate": 0.0001580894953020904,
1517
+ "loss": 0.0821,
1518
+ "step": 212
1519
+ },
1520
+ {
1521
+ "epoch": 4.523560209424084,
1522
+ "grad_norm": 0.0741114616394043,
1523
+ "learning_rate": 0.00015739627752325996,
1524
+ "loss": 0.081,
1525
+ "step": 213
1526
+ },
1527
+ {
1528
+ "epoch": 4.544502617801047,
1529
+ "grad_norm": 0.10303428024053574,
1530
+ "learning_rate": 0.00015669892186897318,
1531
+ "loss": 0.0811,
1532
+ "step": 214
1533
+ },
1534
+ {
1535
+ "epoch": 4.56544502617801,
1536
+ "grad_norm": 0.07302123308181763,
1537
+ "learning_rate": 0.00015599747861375955,
1538
+ "loss": 0.0824,
1539
+ "step": 215
1540
+ },
1541
+ {
1542
+ "epoch": 4.5863874345549736,
1543
+ "grad_norm": 0.09889702498912811,
1544
+ "learning_rate": 0.00015529199832683635,
1545
+ "loss": 0.0798,
1546
+ "step": 216
1547
+ },
1548
+ {
1549
+ "epoch": 4.607329842931938,
1550
+ "grad_norm": 0.0839948058128357,
1551
+ "learning_rate": 0.00015458253186846301,
1552
+ "loss": 0.084,
1553
+ "step": 217
1554
+ },
1555
+ {
1556
+ "epoch": 4.628272251308901,
1557
+ "grad_norm": 0.07695591449737549,
1558
+ "learning_rate": 0.0001538691303862744,
1559
+ "loss": 0.0823,
1560
+ "step": 218
1561
+ },
1562
+ {
1563
+ "epoch": 4.649214659685864,
1564
+ "grad_norm": 0.09040986001491547,
1565
+ "learning_rate": 0.0001531518453115934,
1566
+ "loss": 0.0783,
1567
+ "step": 219
1568
+ },
1569
+ {
1570
+ "epoch": 4.670157068062827,
1571
+ "grad_norm": 0.09041474014520645,
1572
+ "learning_rate": 0.00015243072835572318,
1573
+ "loss": 0.0825,
1574
+ "step": 220
1575
+ },
1576
+ {
1577
+ "epoch": 4.69109947643979,
1578
+ "grad_norm": 0.08503681421279907,
1579
+ "learning_rate": 0.00015170583150621905,
1580
+ "loss": 0.0818,
1581
+ "step": 221
1582
+ },
1583
+ {
1584
+ "epoch": 4.712041884816754,
1585
+ "grad_norm": 0.08206664770841599,
1586
+ "learning_rate": 0.00015097720702314055,
1587
+ "loss": 0.0799,
1588
+ "step": 222
1589
+ },
1590
+ {
1591
+ "epoch": 4.732984293193717,
1592
+ "grad_norm": 0.08691777288913727,
1593
+ "learning_rate": 0.00015024490743528393,
1594
+ "loss": 0.0818,
1595
+ "step": 223
1596
+ },
1597
+ {
1598
+ "epoch": 4.7539267015706805,
1599
+ "grad_norm": 0.07108013331890106,
1600
+ "learning_rate": 0.00014950898553639505,
1601
+ "loss": 0.0796,
1602
+ "step": 224
1603
+ },
1604
+ {
1605
+ "epoch": 4.774869109947644,
1606
+ "grad_norm": 0.09734012186527252,
1607
+ "learning_rate": 0.00014876949438136347,
1608
+ "loss": 0.0848,
1609
+ "step": 225
1610
+ },
1611
+ {
1612
+ "epoch": 4.795811518324607,
1613
+ "grad_norm": 0.07660133391618729,
1614
+ "learning_rate": 0.00014802648728239742,
1615
+ "loss": 0.0823,
1616
+ "step": 226
1617
+ },
1618
+ {
1619
+ "epoch": 4.816753926701571,
1620
+ "grad_norm": 0.0771099254488945,
1621
+ "learning_rate": 0.0001472800178051805,
1622
+ "loss": 0.0816,
1623
+ "step": 227
1624
+ },
1625
+ {
1626
+ "epoch": 4.837696335078534,
1627
+ "grad_norm": 0.08631302416324615,
1628
+ "learning_rate": 0.00014653013976500975,
1629
+ "loss": 0.0824,
1630
+ "step": 228
1631
+ },
1632
+ {
1633
+ "epoch": 4.858638743455497,
1634
+ "grad_norm": 0.07344726473093033,
1635
+ "learning_rate": 0.00014577690722291622,
1636
+ "loss": 0.0785,
1637
+ "step": 229
1638
+ },
1639
+ {
1640
+ "epoch": 4.879581151832461,
1641
+ "grad_norm": 0.08363424241542816,
1642
+ "learning_rate": 0.00014502037448176734,
1643
+ "loss": 0.0796,
1644
+ "step": 230
1645
+ },
1646
+ {
1647
+ "epoch": 4.900523560209424,
1648
+ "grad_norm": 0.07857396453619003,
1649
+ "learning_rate": 0.00014426059608235208,
1650
+ "loss": 0.0806,
1651
+ "step": 231
1652
+ },
1653
+ {
1654
+ "epoch": 4.9214659685863875,
1655
+ "grad_norm": 0.08594755083322525,
1656
+ "learning_rate": 0.00014349762679944896,
1657
+ "loss": 0.0812,
1658
+ "step": 232
1659
+ },
1660
+ {
1661
+ "epoch": 4.942408376963351,
1662
+ "grad_norm": 0.06925872713327408,
1663
+ "learning_rate": 0.00014273152163787726,
1664
+ "loss": 0.0808,
1665
+ "step": 233
1666
+ },
1667
+ {
1668
+ "epoch": 4.963350785340314,
1669
+ "grad_norm": 0.08095414191484451,
1670
+ "learning_rate": 0.0001419623358285314,
1671
+ "loss": 0.0796,
1672
+ "step": 234
1673
+ },
1674
+ {
1675
+ "epoch": 4.984293193717278,
1676
+ "grad_norm": 0.07342205196619034,
1677
+ "learning_rate": 0.0001411901248243993,
1678
+ "loss": 0.0806,
1679
+ "step": 235
1680
+ },
1681
+ {
1682
+ "epoch": 5.020942408376963,
1683
+ "grad_norm": 0.23300093412399292,
1684
+ "learning_rate": 0.00014041494429656442,
1685
+ "loss": 0.1565,
1686
+ "step": 236
1687
+ },
1688
+ {
1689
+ "epoch": 5.041884816753926,
1690
+ "grad_norm": 0.12772393226623535,
1691
+ "learning_rate": 0.0001396368501301925,
1692
+ "loss": 0.0732,
1693
+ "step": 237
1694
+ },
1695
+ {
1696
+ "epoch": 5.06282722513089,
1697
+ "grad_norm": 0.10052972286939621,
1698
+ "learning_rate": 0.00013885589842050253,
1699
+ "loss": 0.0738,
1700
+ "step": 238
1701
+ },
1702
+ {
1703
+ "epoch": 5.0837696335078535,
1704
+ "grad_norm": 0.1047709733247757,
1705
+ "learning_rate": 0.00013807214546872256,
1706
+ "loss": 0.075,
1707
+ "step": 239
1708
+ },
1709
+ {
1710
+ "epoch": 5.104712041884817,
1711
+ "grad_norm": 0.10601655393838882,
1712
+ "learning_rate": 0.00013728564777803088,
1713
+ "loss": 0.0737,
1714
+ "step": 240
1715
+ },
1716
+ {
1717
+ "epoch": 5.12565445026178,
1718
+ "grad_norm": 0.08795251697301865,
1719
+ "learning_rate": 0.00013649646204948255,
1720
+ "loss": 0.0717,
1721
+ "step": 241
1722
+ },
1723
+ {
1724
+ "epoch": 5.146596858638744,
1725
+ "grad_norm": 0.10085717588663101,
1726
+ "learning_rate": 0.00013570464517792153,
1727
+ "loss": 0.0751,
1728
+ "step": 242
1729
+ },
1730
+ {
1731
+ "epoch": 5.167539267015707,
1732
+ "grad_norm": 0.09512262046337128,
1733
+ "learning_rate": 0.00013491025424787915,
1734
+ "loss": 0.073,
1735
+ "step": 243
1736
+ },
1737
+ {
1738
+ "epoch": 5.18848167539267,
1739
+ "grad_norm": 0.08350583910942078,
1740
+ "learning_rate": 0.0001341133465294585,
1741
+ "loss": 0.0761,
1742
+ "step": 244
1743
+ },
1744
+ {
1745
+ "epoch": 5.209424083769633,
1746
+ "grad_norm": 0.09553380310535431,
1747
+ "learning_rate": 0.00013331397947420576,
1748
+ "loss": 0.0747,
1749
+ "step": 245
1750
+ },
1751
+ {
1752
+ "epoch": 5.230366492146596,
1753
+ "grad_norm": 0.07822317630052567,
1754
+ "learning_rate": 0.00013251221071096836,
1755
+ "loss": 0.0745,
1756
+ "step": 246
1757
+ },
1758
+ {
1759
+ "epoch": 5.2513089005235605,
1760
+ "grad_norm": 0.10339541733264923,
1761
+ "learning_rate": 0.00013170809804174022,
1762
+ "loss": 0.0762,
1763
+ "step": 247
1764
+ },
1765
+ {
1766
+ "epoch": 5.272251308900524,
1767
+ "grad_norm": 0.08298144489526749,
1768
+ "learning_rate": 0.00013090169943749476,
1769
+ "loss": 0.0751,
1770
+ "step": 248
1771
+ },
1772
+ {
1773
+ "epoch": 5.293193717277487,
1774
+ "grad_norm": 0.0966297909617424,
1775
+ "learning_rate": 0.00013009307303400556,
1776
+ "loss": 0.0724,
1777
+ "step": 249
1778
+ },
1779
+ {
1780
+ "epoch": 5.31413612565445,
1781
+ "grad_norm": 0.09534008800983429,
1782
+ "learning_rate": 0.00012928227712765504,
1783
+ "loss": 0.0731,
1784
+ "step": 250
1785
+ },
1786
+ {
1787
+ "epoch": 5.335078534031414,
1788
+ "grad_norm": 0.08681947737932205,
1789
+ "learning_rate": 0.00012846937017123197,
1790
+ "loss": 0.075,
1791
+ "step": 251
1792
+ },
1793
+ {
1794
+ "epoch": 5.356020942408377,
1795
+ "grad_norm": 0.09559471905231476,
1796
+ "learning_rate": 0.00012765441076971712,
1797
+ "loss": 0.0717,
1798
+ "step": 252
1799
+ },
1800
+ {
1801
+ "epoch": 5.37696335078534,
1802
+ "grad_norm": 0.08022520691156387,
1803
+ "learning_rate": 0.00012683745767605846,
1804
+ "loss": 0.0766,
1805
+ "step": 253
1806
+ },
1807
+ {
1808
+ "epoch": 5.397905759162303,
1809
+ "grad_norm": 0.10284972935914993,
1810
+ "learning_rate": 0.0001260185697869353,
1811
+ "loss": 0.0704,
1812
+ "step": 254
1813
+ },
1814
+ {
1815
+ "epoch": 5.418848167539267,
1816
+ "grad_norm": 0.08318132907152176,
1817
+ "learning_rate": 0.00012519780613851254,
1818
+ "loss": 0.0746,
1819
+ "step": 255
1820
+ },
1821
+ {
1822
+ "epoch": 5.439790575916231,
1823
+ "grad_norm": 0.08917541056871414,
1824
+ "learning_rate": 0.00012437522590218417,
1825
+ "loss": 0.0733,
1826
+ "step": 256
1827
+ },
1828
+ {
1829
+ "epoch": 5.460732984293194,
1830
+ "grad_norm": 0.08834797143936157,
1831
+ "learning_rate": 0.00012355088838030776,
1832
+ "loss": 0.075,
1833
+ "step": 257
1834
+ },
1835
+ {
1836
+ "epoch": 5.481675392670157,
1837
+ "grad_norm": 0.08001340925693512,
1838
+ "learning_rate": 0.00012272485300192902,
1839
+ "loss": 0.0731,
1840
+ "step": 258
1841
+ },
1842
+ {
1843
+ "epoch": 5.50261780104712,
1844
+ "grad_norm": 0.07309938222169876,
1845
+ "learning_rate": 0.00012189717931849731,
1846
+ "loss": 0.0719,
1847
+ "step": 259
1848
+ },
1849
+ {
1850
+ "epoch": 5.523560209424084,
1851
+ "grad_norm": 0.07951314002275467,
1852
+ "learning_rate": 0.00012106792699957263,
1853
+ "loss": 0.0741,
1854
+ "step": 260
1855
+ },
1856
+ {
1857
+ "epoch": 5.544502617801047,
1858
+ "grad_norm": 0.07957018166780472,
1859
+ "learning_rate": 0.00012023715582852357,
1860
+ "loss": 0.0738,
1861
+ "step": 261
1862
+ },
1863
+ {
1864
+ "epoch": 5.56544502617801,
1865
+ "grad_norm": 0.076540008187294,
1866
+ "learning_rate": 0.00011940492569821753,
1867
+ "loss": 0.0714,
1868
+ "step": 262
1869
+ },
1870
+ {
1871
+ "epoch": 5.5863874345549736,
1872
+ "grad_norm": 0.08407393842935562,
1873
+ "learning_rate": 0.00011857129660670281,
1874
+ "loss": 0.0777,
1875
+ "step": 263
1876
+ },
1877
+ {
1878
+ "epoch": 5.607329842931938,
1879
+ "grad_norm": 0.0788414478302002,
1880
+ "learning_rate": 0.00011773632865288309,
1881
+ "loss": 0.0732,
1882
+ "step": 264
1883
+ },
1884
+ {
1885
+ "epoch": 5.628272251308901,
1886
+ "grad_norm": 0.0724525973200798,
1887
+ "learning_rate": 0.00011690008203218493,
1888
+ "loss": 0.0783,
1889
+ "step": 265
1890
+ },
1891
+ {
1892
+ "epoch": 5.649214659685864,
1893
+ "grad_norm": 0.0882321372628212,
1894
+ "learning_rate": 0.00011606261703221772,
1895
+ "loss": 0.0781,
1896
+ "step": 266
1897
+ },
1898
+ {
1899
+ "epoch": 5.670157068062827,
1900
+ "grad_norm": 0.07683246582746506,
1901
+ "learning_rate": 0.00011522399402842783,
1902
+ "loss": 0.0706,
1903
+ "step": 267
1904
+ },
1905
+ {
1906
+ "epoch": 5.69109947643979,
1907
+ "grad_norm": 0.07433947920799255,
1908
+ "learning_rate": 0.00011438427347974554,
1909
+ "loss": 0.074,
1910
+ "step": 268
1911
+ },
1912
+ {
1913
+ "epoch": 5.712041884816754,
1914
+ "grad_norm": 0.07308503985404968,
1915
+ "learning_rate": 0.00011354351592422665,
1916
+ "loss": 0.0729,
1917
+ "step": 269
1918
+ },
1919
+ {
1920
+ "epoch": 5.732984293193717,
1921
+ "grad_norm": 0.08603333681821823,
1922
+ "learning_rate": 0.00011270178197468789,
1923
+ "loss": 0.0752,
1924
+ "step": 270
1925
+ },
1926
+ {
1927
+ "epoch": 5.7539267015706805,
1928
+ "grad_norm": 0.07910820841789246,
1929
+ "learning_rate": 0.00011185913231433733,
1930
+ "loss": 0.0752,
1931
+ "step": 271
1932
+ },
1933
+ {
1934
+ "epoch": 5.774869109947644,
1935
+ "grad_norm": 0.07771284133195877,
1936
+ "learning_rate": 0.00011101562769239946,
1937
+ "loss": 0.0739,
1938
+ "step": 272
1939
+ },
1940
+ {
1941
+ "epoch": 5.795811518324607,
1942
+ "grad_norm": 0.08137574046850204,
1943
+ "learning_rate": 0.0001101713289197356,
1944
+ "loss": 0.0704,
1945
+ "step": 273
1946
+ },
1947
+ {
1948
+ "epoch": 5.816753926701571,
1949
+ "grad_norm": 0.07771284133195877,
1950
+ "learning_rate": 0.00010932629686445986,
1951
+ "loss": 0.0766,
1952
+ "step": 274
1953
+ },
1954
+ {
1955
+ "epoch": 5.837696335078534,
1956
+ "grad_norm": 0.07269325852394104,
1957
+ "learning_rate": 0.00010848059244755093,
1958
+ "loss": 0.0738,
1959
+ "step": 275
1960
+ },
1961
+ {
1962
+ "epoch": 5.858638743455497,
1963
+ "grad_norm": 0.09434104710817337,
1964
+ "learning_rate": 0.00010763427663846015,
1965
+ "loss": 0.0754,
1966
+ "step": 276
1967
+ },
1968
+ {
1969
+ "epoch": 5.879581151832461,
1970
+ "grad_norm": 0.07986113429069519,
1971
+ "learning_rate": 0.00010678741045071609,
1972
+ "loss": 0.0727,
1973
+ "step": 277
1974
+ },
1975
+ {
1976
+ "epoch": 5.900523560209424,
1977
+ "grad_norm": 0.08152402937412262,
1978
+ "learning_rate": 0.00010594005493752568,
1979
+ "loss": 0.0763,
1980
+ "step": 278
1981
+ },
1982
+ {
1983
+ "epoch": 5.9214659685863875,
1984
+ "grad_norm": 0.08020524680614471,
1985
+ "learning_rate": 0.00010509227118737298,
1986
+ "loss": 0.0728,
1987
+ "step": 279
1988
+ },
1989
+ {
1990
+ "epoch": 5.942408376963351,
1991
+ "grad_norm": 0.08128321915864944,
1992
+ "learning_rate": 0.00010424412031961484,
1993
+ "loss": 0.0726,
1994
+ "step": 280
1995
+ },
1996
+ {
1997
+ "epoch": 5.963350785340314,
1998
+ "grad_norm": 0.09842672944068909,
1999
+ "learning_rate": 0.00010339566348007487,
2000
+ "loss": 0.0738,
2001
+ "step": 281
2002
+ },
2003
+ {
2004
+ "epoch": 5.984293193717278,
2005
+ "grad_norm": 0.0821060761809349,
2006
+ "learning_rate": 0.00010254696183663511,
2007
+ "loss": 0.0741,
2008
+ "step": 282
2009
+ },
2010
+ {
2011
+ "epoch": 6.020942408376963,
2012
+ "grad_norm": 0.24885313212871552,
2013
+ "learning_rate": 0.00010169807657482623,
2014
+ "loss": 0.1464,
2015
+ "step": 283
2016
+ },
2017
+ {
2018
+ "epoch": 6.041884816753926,
2019
+ "grad_norm": 0.10097737610340118,
2020
+ "learning_rate": 0.00010084906889341656,
2021
+ "loss": 0.0664,
2022
+ "step": 284
2023
+ },
2024
+ {
2025
+ "epoch": 6.06282722513089,
2026
+ "grad_norm": 0.10555354505777359,
2027
+ "learning_rate": 0.0001,
2028
+ "loss": 0.0672,
2029
+ "step": 285
2030
+ },
2031
+ {
2032
+ "epoch": 6.0837696335078535,
2033
+ "grad_norm": 0.10018379241228104,
2034
+ "learning_rate": 9.915093110658346e-05,
2035
+ "loss": 0.0678,
2036
+ "step": 286
2037
+ },
2038
+ {
2039
+ "epoch": 6.104712041884817,
2040
+ "grad_norm": 0.09475495666265488,
2041
+ "learning_rate": 9.830192342517379e-05,
2042
+ "loss": 0.0662,
2043
+ "step": 287
2044
+ },
2045
+ {
2046
+ "epoch": 6.12565445026178,
2047
+ "grad_norm": 0.10079528391361237,
2048
+ "learning_rate": 9.745303816336489e-05,
2049
+ "loss": 0.0645,
2050
+ "step": 288
2051
+ },
2052
+ {
2053
+ "epoch": 6.146596858638744,
2054
+ "grad_norm": 0.08908044546842575,
2055
+ "learning_rate": 9.660433651992514e-05,
2056
+ "loss": 0.0671,
2057
+ "step": 289
2058
+ },
2059
+ {
2060
+ "epoch": 6.167539267015707,
2061
+ "grad_norm": 0.10220327973365784,
2062
+ "learning_rate": 9.57558796803852e-05,
2063
+ "loss": 0.0652,
2064
+ "step": 290
2065
+ },
2066
+ {
2067
+ "epoch": 6.18848167539267,
2068
+ "grad_norm": 0.0913078561425209,
2069
+ "learning_rate": 9.490772881262709e-05,
2070
+ "loss": 0.0654,
2071
+ "step": 291
2072
+ },
2073
+ {
2074
+ "epoch": 6.209424083769633,
2075
+ "grad_norm": 0.09089622646570206,
2076
+ "learning_rate": 9.405994506247432e-05,
2077
+ "loss": 0.0659,
2078
+ "step": 292
2079
+ },
2080
+ {
2081
+ "epoch": 6.230366492146596,
2082
+ "grad_norm": 0.10402899235486984,
2083
+ "learning_rate": 9.321258954928393e-05,
2084
+ "loss": 0.0672,
2085
+ "step": 293
2086
+ },
2087
+ {
2088
+ "epoch": 6.2513089005235605,
2089
+ "grad_norm": 0.09207270294427872,
2090
+ "learning_rate": 9.236572336153986e-05,
2091
+ "loss": 0.0688,
2092
+ "step": 294
2093
+ },
2094
+ {
2095
+ "epoch": 6.272251308900524,
2096
+ "grad_norm": 0.10593326389789581,
2097
+ "learning_rate": 9.151940755244912e-05,
2098
+ "loss": 0.0655,
2099
+ "step": 295
2100
+ },
2101
+ {
2102
+ "epoch": 6.293193717277487,
2103
+ "grad_norm": 0.09085794538259506,
2104
+ "learning_rate": 9.067370313554015e-05,
2105
+ "loss": 0.0663,
2106
+ "step": 296
2107
+ },
2108
+ {
2109
+ "epoch": 6.31413612565445,
2110
+ "grad_norm": 0.106470987200737,
2111
+ "learning_rate": 8.982867108026442e-05,
2112
+ "loss": 0.0659,
2113
+ "step": 297
2114
+ },
2115
+ {
2116
+ "epoch": 6.335078534031414,
2117
+ "grad_norm": 0.08805633336305618,
2118
+ "learning_rate": 8.898437230760058e-05,
2119
+ "loss": 0.0672,
2120
+ "step": 298
2121
+ },
2122
+ {
2123
+ "epoch": 6.356020942408377,
2124
+ "grad_norm": 0.10489070415496826,
2125
+ "learning_rate": 8.814086768566272e-05,
2126
+ "loss": 0.0665,
2127
+ "step": 299
2128
+ },
2129
+ {
2130
+ "epoch": 6.37696335078534,
2131
+ "grad_norm": 0.11271199584007263,
2132
+ "learning_rate": 8.729821802531212e-05,
2133
+ "loss": 0.0685,
2134
+ "step": 300
2135
+ },
2136
+ {
2137
+ "epoch": 6.37696335078534,
2138
+ "eval_loss": 0.09648442268371582,
2139
+ "eval_runtime": 36.2137,
2140
+ "eval_samples_per_second": 42.222,
2141
+ "eval_steps_per_second": 0.331,
2142
+ "step": 300
2143
  }
2144
  ],
2145
  "logging_steps": 1,
 
2154
  "early_stopping_threshold": 0.0
2155
  },
2156
  "attributes": {
2157
+ "early_stopping_patience_counter": 1
2158
  }
2159
  },
2160
  "TrainerControl": {
 
2163
  "should_evaluate": false,
2164
  "should_log": false,
2165
  "should_save": true,
2166
+ "should_training_stop": true
2167
  },
2168
  "attributes": {}
2169
  }
2170
  },
2171
+ "total_flos": 8.071804808450802e+18,
2172
  "train_batch_size": 3,
2173
  "trial_name": null,
2174
  "trial_params": null