aleegis commited on
Commit
ed68727
·
verified ·
1 Parent(s): dc5dcc7

Training in progress, epoch 1, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d5a4db17a6b0d3aa294ab149bdab58b500912c49a77a26b84efeaa8c5ba0a8cf
3
  size 194563400
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6869b2d71e2f9a989f469926b109e48afa90079a4410cb2bc4606b2f558a88b
3
  size 194563400
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:000693a5cee3243f5f119e4269236d0545278d41e28f23ba8f0185105a641f78
3
  size 389358058
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d666fc4f02e4046fb59081b8bc0c0045b07bcfff4cdf17054f52eb6207791394
3
  size 389358058
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b22b562a9fd534225eadfb95f9c8a88b6901d71ec1a24ce1b5d77950baecd454
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb59e5d55e15032974abe5598466b524fa64fa84b0cbe8c580722b1f9d299a9b
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1a6586a314fcf3dc86442a2142990493b4f7a7df2410791efb9baa5882c5e09a
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:891cad020bf7bee78efa739dc10e1e4315e34b096ed70226b38590ec81d7d418
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9998866855524079,
5
  "eval_steps": 500,
6
- "global_step": 1103,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1106,6 +1106,405 @@
1106
  "learning_rate": 1.8913309610379015e-05,
1107
  "loss": 1.0895,
1108
  "step": 1099
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1109
  }
1110
  ],
1111
  "logging_steps": 7,
@@ -1120,12 +1519,12 @@
1120
  "should_evaluate": false,
1121
  "should_log": false,
1122
  "should_save": true,
1123
- "should_training_stop": false
1124
  },
1125
  "attributes": {}
1126
  }
1127
  },
1128
- "total_flos": 3.111012984516772e+17,
1129
  "train_batch_size": 2,
1130
  "trial_name": null,
1131
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.3597733711048159,
5
  "eval_steps": 500,
6
+ "global_step": 1500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1106
  "learning_rate": 1.8913309610379015e-05,
1107
  "loss": 1.0895,
1108
  "step": 1099
1109
+ },
1110
+ {
1111
+ "epoch": 1.0026062322946176,
1112
+ "grad_norm": 1.0843678712844849,
1113
+ "learning_rate": 1.8302023674591935e-05,
1114
+ "loss": 1.0332,
1115
+ "step": 1106
1116
+ },
1117
+ {
1118
+ "epoch": 1.0089518413597733,
1119
+ "grad_norm": 1.0455607175827026,
1120
+ "learning_rate": 1.7698558740156135e-05,
1121
+ "loss": 0.733,
1122
+ "step": 1113
1123
+ },
1124
+ {
1125
+ "epoch": 1.0152974504249292,
1126
+ "grad_norm": 1.5471065044403076,
1127
+ "learning_rate": 1.7103063703014372e-05,
1128
+ "loss": 0.8191,
1129
+ "step": 1120
1130
+ },
1131
+ {
1132
+ "epoch": 1.021643059490085,
1133
+ "grad_norm": 1.0794677734375,
1134
+ "learning_rate": 1.6515685492656467e-05,
1135
+ "loss": 0.6541,
1136
+ "step": 1127
1137
+ },
1138
+ {
1139
+ "epoch": 1.0279886685552408,
1140
+ "grad_norm": 1.2111717462539673,
1141
+ "learning_rate": 1.59365690358667e-05,
1142
+ "loss": 0.6832,
1143
+ "step": 1134
1144
+ },
1145
+ {
1146
+ "epoch": 1.0343342776203965,
1147
+ "grad_norm": 1.243001937866211,
1148
+ "learning_rate": 1.5365857220965275e-05,
1149
+ "loss": 0.7858,
1150
+ "step": 1141
1151
+ },
1152
+ {
1153
+ "epoch": 1.0406798866855524,
1154
+ "grad_norm": 1.0979713201522827,
1155
+ "learning_rate": 1.4803690862552755e-05,
1156
+ "loss": 0.6973,
1157
+ "step": 1148
1158
+ },
1159
+ {
1160
+ "epoch": 1.0470254957507081,
1161
+ "grad_norm": 1.1694271564483643,
1162
+ "learning_rate": 1.4250208666766235e-05,
1163
+ "loss": 0.7052,
1164
+ "step": 1155
1165
+ },
1166
+ {
1167
+ "epoch": 1.053371104815864,
1168
+ "grad_norm": 1.2045671939849854,
1169
+ "learning_rate": 1.3705547197055584e-05,
1170
+ "loss": 0.7855,
1171
+ "step": 1162
1172
+ },
1173
+ {
1174
+ "epoch": 1.0597167138810197,
1175
+ "grad_norm": 1.1639701128005981,
1176
+ "learning_rate": 1.3169840840488501e-05,
1177
+ "loss": 0.6912,
1178
+ "step": 1169
1179
+ },
1180
+ {
1181
+ "epoch": 1.0660623229461756,
1182
+ "grad_norm": 1.2501615285873413,
1183
+ "learning_rate": 1.2643221774592518e-05,
1184
+ "loss": 0.6945,
1185
+ "step": 1176
1186
+ },
1187
+ {
1188
+ "epoch": 1.0724079320113313,
1189
+ "grad_norm": 0.9910327792167664,
1190
+ "learning_rate": 1.2125819934742188e-05,
1191
+ "loss": 0.6741,
1192
+ "step": 1183
1193
+ },
1194
+ {
1195
+ "epoch": 1.0787535410764872,
1196
+ "grad_norm": 1.535058617591858,
1197
+ "learning_rate": 1.1617762982099446e-05,
1198
+ "loss": 0.7266,
1199
+ "step": 1190
1200
+ },
1201
+ {
1202
+ "epoch": 1.0850991501416432,
1203
+ "grad_norm": 1.055584192276001,
1204
+ "learning_rate": 1.1119176272115128e-05,
1205
+ "loss": 0.6585,
1206
+ "step": 1197
1207
+ },
1208
+ {
1209
+ "epoch": 1.0914447592067988,
1210
+ "grad_norm": 1.0781307220458984,
1211
+ "learning_rate": 1.0630182823599399e-05,
1212
+ "loss": 0.7043,
1213
+ "step": 1204
1214
+ },
1215
+ {
1216
+ "epoch": 1.0977903682719548,
1217
+ "grad_norm": 1.3469446897506714,
1218
+ "learning_rate": 1.0150903288368741e-05,
1219
+ "loss": 0.7075,
1220
+ "step": 1211
1221
+ },
1222
+ {
1223
+ "epoch": 1.1041359773371104,
1224
+ "grad_norm": 1.3512985706329346,
1225
+ "learning_rate": 9.681455921476839e-06,
1226
+ "loss": 0.6877,
1227
+ "step": 1218
1228
+ },
1229
+ {
1230
+ "epoch": 1.1104815864022664,
1231
+ "grad_norm": 1.4418903589248657,
1232
+ "learning_rate": 9.221956552036992e-06,
1233
+ "loss": 0.7409,
1234
+ "step": 1225
1235
+ },
1236
+ {
1237
+ "epoch": 1.116827195467422,
1238
+ "grad_norm": 1.3517122268676758,
1239
+ "learning_rate": 8.772518554642973e-06,
1240
+ "loss": 0.7251,
1241
+ "step": 1232
1242
+ },
1243
+ {
1244
+ "epoch": 1.123172804532578,
1245
+ "grad_norm": 1.2315598726272583,
1246
+ "learning_rate": 8.333252821395526e-06,
1247
+ "loss": 0.7255,
1248
+ "step": 1239
1249
+ },
1250
+ {
1251
+ "epoch": 1.1295184135977336,
1252
+ "grad_norm": 1.1964752674102783,
1253
+ "learning_rate": 7.904267734541498e-06,
1254
+ "loss": 0.6761,
1255
+ "step": 1246
1256
+ },
1257
+ {
1258
+ "epoch": 1.1358640226628895,
1259
+ "grad_norm": 1.4459127187728882,
1260
+ "learning_rate": 7.485669139732004e-06,
1261
+ "loss": 0.681,
1262
+ "step": 1253
1263
+ },
1264
+ {
1265
+ "epoch": 1.1422096317280452,
1266
+ "grad_norm": 1.2472503185272217,
1267
+ "learning_rate": 7.077560319906695e-06,
1268
+ "loss": 0.6352,
1269
+ "step": 1260
1270
+ },
1271
+ {
1272
+ "epoch": 1.1485552407932011,
1273
+ "grad_norm": 1.5689338445663452,
1274
+ "learning_rate": 6.680041969810203e-06,
1275
+ "loss": 0.729,
1276
+ "step": 1267
1277
+ },
1278
+ {
1279
+ "epoch": 1.154900849858357,
1280
+ "grad_norm": 1.3047913312911987,
1281
+ "learning_rate": 6.293212171147206e-06,
1282
+ "loss": 0.7166,
1283
+ "step": 1274
1284
+ },
1285
+ {
1286
+ "epoch": 1.1612464589235127,
1287
+ "grad_norm": 1.1112953424453735,
1288
+ "learning_rate": 5.917166368382277e-06,
1289
+ "loss": 0.7171,
1290
+ "step": 1281
1291
+ },
1292
+ {
1293
+ "epoch": 1.1675920679886684,
1294
+ "grad_norm": 1.3600448369979858,
1295
+ "learning_rate": 5.5519973451903405e-06,
1296
+ "loss": 0.8326,
1297
+ "step": 1288
1298
+ },
1299
+ {
1300
+ "epoch": 1.1739376770538243,
1301
+ "grad_norm": 1.092890977859497,
1302
+ "learning_rate": 5.197795201563743e-06,
1303
+ "loss": 0.6963,
1304
+ "step": 1295
1305
+ },
1306
+ {
1307
+ "epoch": 1.1802832861189803,
1308
+ "grad_norm": 1.1682928800582886,
1309
+ "learning_rate": 4.8546473315813856e-06,
1310
+ "loss": 0.667,
1311
+ "step": 1302
1312
+ },
1313
+ {
1314
+ "epoch": 1.186628895184136,
1315
+ "grad_norm": 1.8733927011489868,
1316
+ "learning_rate": 4.522638401845547e-06,
1317
+ "loss": 0.743,
1318
+ "step": 1309
1319
+ },
1320
+ {
1321
+ "epoch": 1.1929745042492919,
1322
+ "grad_norm": 1.3126906156539917,
1323
+ "learning_rate": 4.2018503305916775e-06,
1324
+ "loss": 0.7463,
1325
+ "step": 1316
1326
+ },
1327
+ {
1328
+ "epoch": 1.1993201133144475,
1329
+ "grad_norm": 1.325699806213379,
1330
+ "learning_rate": 3.892362267476313e-06,
1331
+ "loss": 0.6793,
1332
+ "step": 1323
1333
+ },
1334
+ {
1335
+ "epoch": 1.2056657223796035,
1336
+ "grad_norm": 1.2438085079193115,
1337
+ "learning_rate": 3.5942505740480582e-06,
1338
+ "loss": 0.7638,
1339
+ "step": 1330
1340
+ },
1341
+ {
1342
+ "epoch": 1.2120113314447591,
1343
+ "grad_norm": 1.0289039611816406,
1344
+ "learning_rate": 3.3075888049065196e-06,
1345
+ "loss": 0.7364,
1346
+ "step": 1337
1347
+ },
1348
+ {
1349
+ "epoch": 1.218356940509915,
1350
+ "grad_norm": 1.4916263818740845,
1351
+ "learning_rate": 3.03244768955383e-06,
1352
+ "loss": 0.7621,
1353
+ "step": 1344
1354
+ },
1355
+ {
1356
+ "epoch": 1.2247025495750707,
1357
+ "grad_norm": 1.6056360006332397,
1358
+ "learning_rate": 2.7688951149431595e-06,
1359
+ "loss": 0.6288,
1360
+ "step": 1351
1361
+ },
1362
+ {
1363
+ "epoch": 1.2310481586402267,
1364
+ "grad_norm": 1.4644125699996948,
1365
+ "learning_rate": 2.5169961087286974e-06,
1366
+ "loss": 0.7109,
1367
+ "step": 1358
1368
+ },
1369
+ {
1370
+ "epoch": 1.2373937677053823,
1371
+ "grad_norm": 1.2924115657806396,
1372
+ "learning_rate": 2.276812823220964e-06,
1373
+ "loss": 0.6827,
1374
+ "step": 1365
1375
+ },
1376
+ {
1377
+ "epoch": 1.2437393767705383,
1378
+ "grad_norm": 1.34138023853302,
1379
+ "learning_rate": 2.048404520051722e-06,
1380
+ "loss": 0.7187,
1381
+ "step": 1372
1382
+ },
1383
+ {
1384
+ "epoch": 1.2500849858356942,
1385
+ "grad_norm": 1.1116788387298584,
1386
+ "learning_rate": 1.8318275555520237e-06,
1387
+ "loss": 0.7556,
1388
+ "step": 1379
1389
+ },
1390
+ {
1391
+ "epoch": 1.2564305949008499,
1392
+ "grad_norm": 1.0539860725402832,
1393
+ "learning_rate": 1.6271353668471655e-06,
1394
+ "loss": 0.6949,
1395
+ "step": 1386
1396
+ },
1397
+ {
1398
+ "epoch": 1.2627762039660055,
1399
+ "grad_norm": 1.5152783393859863,
1400
+ "learning_rate": 1.4343784586718311e-06,
1401
+ "loss": 0.7225,
1402
+ "step": 1393
1403
+ },
1404
+ {
1405
+ "epoch": 1.2691218130311614,
1406
+ "grad_norm": 1.1229016780853271,
1407
+ "learning_rate": 1.2536043909088191e-06,
1408
+ "loss": 0.6508,
1409
+ "step": 1400
1410
+ },
1411
+ {
1412
+ "epoch": 1.2754674220963174,
1413
+ "grad_norm": 1.4070674180984497,
1414
+ "learning_rate": 1.0848577668543802e-06,
1415
+ "loss": 0.7195,
1416
+ "step": 1407
1417
+ },
1418
+ {
1419
+ "epoch": 1.281813031161473,
1420
+ "grad_norm": 1.3908133506774902,
1421
+ "learning_rate": 9.281802222129765e-07,
1422
+ "loss": 0.6959,
1423
+ "step": 1414
1424
+ },
1425
+ {
1426
+ "epoch": 1.288158640226629,
1427
+ "grad_norm": 1.5261811017990112,
1428
+ "learning_rate": 7.836104148243484e-07,
1429
+ "loss": 0.6707,
1430
+ "step": 1421
1431
+ },
1432
+ {
1433
+ "epoch": 1.2945042492917846,
1434
+ "grad_norm": 1.4674750566482544,
1435
+ "learning_rate": 6.511840151252169e-07,
1436
+ "loss": 0.6538,
1437
+ "step": 1428
1438
+ },
1439
+ {
1440
+ "epoch": 1.3008498583569406,
1441
+ "grad_norm": 1.7782832384109497,
1442
+ "learning_rate": 5.309336973481683e-07,
1443
+ "loss": 0.6876,
1444
+ "step": 1435
1445
+ },
1446
+ {
1447
+ "epoch": 1.3071954674220962,
1448
+ "grad_norm": 1.080048680305481,
1449
+ "learning_rate": 4.228891314597694e-07,
1450
+ "loss": 0.6361,
1451
+ "step": 1442
1452
+ },
1453
+ {
1454
+ "epoch": 1.3135410764872522,
1455
+ "grad_norm": 1.295278549194336,
1456
+ "learning_rate": 3.2707697583995167e-07,
1457
+ "loss": 0.6829,
1458
+ "step": 1449
1459
+ },
1460
+ {
1461
+ "epoch": 1.319886685552408,
1462
+ "grad_norm": 1.4356597661972046,
1463
+ "learning_rate": 2.4352087070443895e-07,
1464
+ "loss": 0.6593,
1465
+ "step": 1456
1466
+ },
1467
+ {
1468
+ "epoch": 1.3262322946175638,
1469
+ "grad_norm": 1.2196033000946045,
1470
+ "learning_rate": 1.7224143227190236e-07,
1471
+ "loss": 0.7315,
1472
+ "step": 1463
1473
+ },
1474
+ {
1475
+ "epoch": 1.3325779036827194,
1476
+ "grad_norm": 1.365475058555603,
1477
+ "learning_rate": 1.132562476771959e-07,
1478
+ "loss": 0.6959,
1479
+ "step": 1470
1480
+ },
1481
+ {
1482
+ "epoch": 1.3389235127478754,
1483
+ "grad_norm": 1.3029407262802124,
1484
+ "learning_rate": 6.657987063200533e-08,
1485
+ "loss": 0.7561,
1486
+ "step": 1477
1487
+ },
1488
+ {
1489
+ "epoch": 1.3452691218130313,
1490
+ "grad_norm": 1.1381126642227173,
1491
+ "learning_rate": 3.2223817833931805e-08,
1492
+ "loss": 0.687,
1493
+ "step": 1484
1494
+ },
1495
+ {
1496
+ "epoch": 1.351614730878187,
1497
+ "grad_norm": 1.1441847085952759,
1498
+ "learning_rate": 1.019656612492592e-08,
1499
+ "loss": 0.7373,
1500
+ "step": 1491
1501
+ },
1502
+ {
1503
+ "epoch": 1.3579603399433426,
1504
+ "grad_norm": 1.493086814880371,
1505
+ "learning_rate": 5.035503997385949e-10,
1506
+ "loss": 0.7458,
1507
+ "step": 1498
1508
  }
1509
  ],
1510
  "logging_steps": 7,
 
1519
  "should_evaluate": false,
1520
  "should_log": false,
1521
  "should_save": true,
1522
+ "should_training_stop": true
1523
  },
1524
  "attributes": {}
1525
  }
1526
  },
1527
+ "total_flos": 4.230406863197307e+17,
1528
  "train_batch_size": 2,
1529
  "trial_name": null,
1530
  "trial_params": null