aiden200 commited on
Commit
56278bd
·
verified ·
1 Parent(s): 0b8b2a3

Training in progress, step 3100

Browse files
Files changed (2) hide show
  1. adapter_model.safetensors +1 -1
  2. train.log +358 -0
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c4ba76fbc41735d0ff74c7b6d036dbd2170f764042bdc8759ae76333d2c00e81
3
  size 1204780872
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44f8a7d14f22c01a14ae42fef575ccd9bfbced1fa7a387e0f0fe630a34c22899
3
  size 1204780872
train.log CHANGED
@@ -16276,3 +16276,361 @@ tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device=
16276
  tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
16277
  {'train/tv_loss': None, 'train/lm_loss': 0.15130637884140016, 'train/info_loss': 0.1600653976202011, 'train/ref_loss': None, 'train/uncertainty_loss': -9.910131338983775e-05, 'train/video_loss': 0.15996628999710083, 'train/total_loss': 0.31127268075942993}
16278
  tensor(0.0755, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16276
  tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
16277
  {'train/tv_loss': None, 'train/lm_loss': 0.15130637884140016, 'train/info_loss': 0.1600653976202011, 'train/ref_loss': None, 'train/uncertainty_loss': -9.910131338983775e-05, 'train/video_loss': 0.15996628999710083, 'train/total_loss': 0.31127268075942993}
16278
  tensor(0.0755, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16279
+ [Rank 3] Trainer log: {'loss': 0.2861, 'grad_norm': 4.437143802642822, 'learning_rate': 3.0097681139307223e-09}
16280
+ [Rank 2] Trainer log: {'loss': 0.2861, 'grad_norm': 4.437143802642822, 'learning_rate': 3.0097681139307223e-09}[Rank 0] Trainer log: {'loss': 0.2861, 'grad_norm': 4.437143802642822, 'learning_rate': 3.0097681139307223e-09}
16281
+ [Rank 1] Trainer log: {'loss': 0.2861, 'grad_norm': 4.437143802642822, 'learning_rate': 3.0097681139307223e-09}
16282
+
16283
+ {'loss': 0.2861, 'grad_norm': 4.437143802642822, 'learning_rate': 3.0097681139307223e-09, 'epoch': 0.99}
16284
+ tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
16285
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16286
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16287
+ {'train/tv_loss': 0.0001784276915714145, 'train/lm_loss': 1.5091327077243478e-05, 'train/info_loss': 1.3351262168725953e-05, 'train/ref_loss': 0.10094004124403, 'train/uncertainty_loss': -6.672072340734303e-05, 'train/video_loss': 0.10231409221887589, 'train/total_loss': 0.10232918709516525}
16288
+ tensor(0.0191, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16289
+ tensor(0.1469, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16290
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16291
+ tensor(0.4153, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16292
+ tensor(0.1078, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16293
+ {'train/tv_loss': 0.0001733818091452122, 'train/lm_loss': 2.2076342429500076e-05, 'train/info_loss': 1.5854584489716217e-05, 'train/ref_loss': 0.2894924581050873, 'train/uncertainty_loss': 0.010783981531858444, 'train/video_loss': 0.3016793429851532, 'train/total_loss': 0.3017014265060425}
16294
+ [Rank 1] Trainer log: {'loss': 0.2588, 'grad_norm': 8.357850074768066, 'learning_rate': 2.753750441613079e-09}
16295
+ [Rank 2] Trainer log: {'loss': 0.2588, 'grad_norm': 8.357850074768066, 'learning_rate': 2.753750441613079e-09}
16296
+ [Rank 0] Trainer log: {'loss': 0.2588, 'grad_norm': 8.357850074768066, 'learning_rate': 2.753750441613079e-09}[Rank 3] Trainer log: {'loss': 0.2588, 'grad_norm': 8.357850074768066, 'learning_rate': 2.753750441613079e-09}
16297
+
16298
+ {'loss': 0.2588, 'grad_norm': 8.357850074768066, 'learning_rate': 2.753750441613079e-09, 'epoch': 0.99}
16299
+ tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
16300
+ tensor(0.0729, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16301
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16302
+ tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16303
+ {'train/tv_loss': 0.0004507274366915226, 'train/lm_loss': 2.8608183492906394e-05, 'train/info_loss': 1.740425250318367e-05, 'train/ref_loss': 0.16604295372962952, 'train/uncertainty_loss': -7.334401598200203e-05, 'train/video_loss': 0.16959282755851746, 'train/total_loss': 0.16962143778800964}
16304
+ tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
16305
+ {'train/tv_loss': None, 'train/lm_loss': 0.15873619318008425, 'train/info_loss': 0.20619021356105804, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012108908267691732, 'train/video_loss': 0.20606912672519684, 'train/total_loss': 0.36480534076690674}
16306
+ tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
16307
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16308
+ tensor(0.0074, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16309
+ [Rank 3] Trainer log: {'loss': 0.2282, 'grad_norm': 2.0619752407073975, 'learning_rate': 2.509109290893541e-09}[Rank 2] Trainer log: {'loss': 0.2282, 'grad_norm': 2.0619752407073975, 'learning_rate': 2.509109290893541e-09}
16310
+
16311
+ [Rank 1] Trainer log: {'loss': 0.2282, 'grad_norm': 2.0619752407073975, 'learning_rate': 2.509109290893541e-09}
16312
+ [Rank 0] Trainer log: {'loss': 0.2282, 'grad_norm': 2.0619752407073975, 'learning_rate': 2.509109290893541e-09}
16313
+ {'loss': 0.2282, 'grad_norm': 2.0619752407073975, 'learning_rate': 2.509109290893541e-09, 'epoch': 0.99}
16314
+ tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
16315
+ tensor(0.4648, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16316
+ tensor(-0.0008, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:3', grad_fn=<MulBackward0>)
16317
+ tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
16318
+ {'train/tv_loss': None, 'train/lm_loss': 0.31027505397796634, 'train/info_loss': 0.33556777238845825, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012801478151232005, 'train/video_loss': 0.3354397714138031, 'train/total_loss': 0.6457148194313049}
16319
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
16320
+ tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
16321
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16322
+ {'train/tv_loss': 0.0001363527961075306, 'train/lm_loss': 2.822676906362176e-05, 'train/info_loss': 1.710624019324314e-05, 'train/ref_loss': 0.13075466454029083, 'train/uncertainty_loss': -6.930269300937652e-05, 'train/video_loss': 0.1317932903766632, 'train/total_loss': 0.13182151317596436}
16323
+ tensor(0.0040, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16324
+ [Rank 2] Trainer log: {'loss': 0.4774, 'grad_norm': 2.6274688243865967, 'learning_rate': 2.2758449401638628e-09}
16325
+ [Rank 0] Trainer log: {'loss': 0.4774, 'grad_norm': 2.6274688243865967, 'learning_rate': 2.2758449401638628e-09}[Rank 3] Trainer log: {'loss': 0.4774, 'grad_norm': 2.6274688243865967, 'learning_rate': 2.2758449401638628e-09}
16326
+
16327
+ {'loss': 0.4774, 'grad_norm': 2.6274688243865967, 'learning_rate': 2.2758449401638628e-09, 'epoch': 0.99}
16328
+ [Rank 1] Trainer log: {'loss': 0.4774, 'grad_norm': 2.6274688243865967, 'learning_rate': 2.2758449401638628e-09}
16329
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
16330
+ tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
16331
+ tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
16332
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
16333
+ {'train/tv_loss': None, 'train/lm_loss': 0.30461447238922124, 'train/info_loss': 0.1613609790802002, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011446572607383133, 'train/video_loss': 0.16124650835990906, 'train/total_loss': 0.4658609926700592}
16334
+ tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
16335
+ tensor(-0.0008, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:0', grad_fn=<MulBackward0>)
16336
+ {'train/tv_loss': None, 'train/lm_loss': 0.2111635446548462, 'train/info_loss': 0.14900483191013336, 'train/ref_loss': None, 'train/uncertainty_loss': -8.154477691277862e-05, 'train/video_loss': 0.14892329275608063, 'train/total_loss': 0.3600868582725525}
16337
+ tensor(0.1748, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16338
+ tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
16339
+ [Rank 1] Trainer log: {'loss': 0.4107, 'grad_norm': 6.459842681884766, 'learning_rate': 2.053957654871708e-09}
16340
+ [Rank 3] Trainer log: {'loss': 0.4107, 'grad_norm': 6.459842681884766, 'learning_rate': 2.053957654871708e-09}
16341
+ [Rank 0] Trainer log: {'loss': 0.4107, 'grad_norm': 6.459842681884766, 'learning_rate': 2.053957654871708e-09}[Rank 2] Trainer log: {'loss': 0.4107, 'grad_norm': 6.459842681884766, 'learning_rate': 2.053957654871708e-09}
16342
+
16343
+ {'loss': 0.4107, 'grad_norm': 6.459842681884766, 'learning_rate': 2.053957654871708e-09, 'epoch': 0.99}
16344
+ tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
16345
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16346
+ tensor(0.0091, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16347
+ tensor(0.0925, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16348
+ {'train/tv_loss': 0.00016606017015874386, 'train/lm_loss': 7.689904887229205e-05, 'train/info_loss': 2.1993630070937797e-05, 'train/ref_loss': 0.2799209952354431, 'train/uncertainty_loss': 0.009252391010522843, 'train/video_loss': 0.29052385687828064, 'train/total_loss': 0.2906007468700409}
16349
+ tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
16350
+ tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
16351
+ tensor(0.1732, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16352
+ tensor(0.1812, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16353
+ {'train/tv_loss': 0.00012968671508133413, 'train/lm_loss': 3.659390495158732e-05, 'train/info_loss': 2.002676046686247e-05, 'train/ref_loss': 0.3349378705024719, 'train/uncertainty_loss': 0.018121950328350067, 'train/video_loss': 0.35411736369132996, 'train/total_loss': 0.35415396094322205}
16354
+ [Rank 3] Trainer log: {'loss': 0.3274, 'grad_norm': 3.277092456817627, 'learning_rate': 1.8434476875162088e-09}
16355
+ [Rank 2] Trainer log: {'loss': 0.3274, 'grad_norm': 3.277092456817627, 'learning_rate': 1.8434476875162088e-09}
16356
+ [Rank 0] Trainer log: {'loss': 0.3274, 'grad_norm': 3.277092456817627, 'learning_rate': 1.8434476875162088e-09}[Rank 1] Trainer log: {'loss': 0.3274, 'grad_norm': 3.277092456817627, 'learning_rate': 1.8434476875162088e-09}
16357
+
16358
+ {'loss': 0.3274, 'grad_norm': 3.277092456817627, 'learning_rate': 1.8434476875162088e-09, 'epoch': 0.99}
16359
+ tensor(-0.0014, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:3', grad_fn=<MulBackward0>)
16360
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(0.0697, device='cuda:0', grad_fn=<AddBackward0>)tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16361
+ tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16362
+ {'train/tv_loss': 0.00016026009107008579, 'train/lm_loss': 1.6855483409017324e-05, 'train/info_loss': 1.4424115761357825e-05, 'train/ref_loss': 0.26810598373413086, 'train/uncertainty_loss': 0.006967854499816895, 'train/video_loss': 0.2763703167438507, 'train/total_loss': 0.27638718485832214}
16363
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16364
+ tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
16365
+ tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
16366
+ {'train/tv_loss': None, 'train/lm_loss': 0.10554903745651245, 'train/info_loss': 0.11926790326833725, 'train/ref_loss': None, 'train/uncertainty_loss': -9.559270110912622e-05, 'train/video_loss': 0.11917231231927872, 'train/total_loss': 0.22472134232521057}
16367
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16368
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16369
+ [Rank 3] Trainer log: {'loss': 0.2043, 'grad_norm': 7.04038667678833, 'learning_rate': 1.6443152776524085e-09}[Rank 1] Trainer log: {'loss': 0.2043, 'grad_norm': 7.04038667678833, 'learning_rate': 1.6443152776524085e-09}[Rank 0] Trainer log: {'loss': 0.2043, 'grad_norm': 7.04038667678833, 'learning_rate': 1.6443152776524085e-09}
16370
+
16371
+ [Rank 2] Trainer log: {'loss': 0.2043, 'grad_norm': 7.04038667678833, 'learning_rate': 1.6443152776524085e-09}
16372
+
16373
+ {'loss': 0.2043, 'grad_norm': 7.04038667678833, 'learning_rate': 1.6443152776524085e-09, 'epoch': 0.99}
16374
+ tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
16375
+ {'train/tv_loss': None, 'train/lm_loss': 0.10073459148406982, 'train/info_loss': 0.14998847246170044, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012555404100567102, 'train/video_loss': 0.14986291527748108, 'train/total_loss': 0.2505975067615509}
16376
+ tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
16377
+ tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
16378
+ tensor(0.2004, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16379
+ tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>)tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
16380
+ tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
16381
+ {'train/tv_loss': None, 'train/lm_loss': 0.39838287830352787, 'train/info_loss': 0.1479235589504242, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012364789145067335, 'train/video_loss': 0.14779990911483765, 'train/total_loss': 0.5461827516555786}
16382
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16383
+ tensor(-0.0006, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0006, device='cuda:3', grad_fn=<MulBackward0>)
16384
+ [Rank 1] Trainer log: {'loss': 0.3795, 'grad_norm': 2.248142957687378, 'learning_rate': 1.4565606518845976e-09}[Rank 2] Trainer log: {'loss': 0.3795, 'grad_norm': 2.248142957687378, 'learning_rate': 1.4565606518845976e-09}
16385
+ [Rank 3] Trainer log: {'loss': 0.3795, 'grad_norm': 2.248142957687378, 'learning_rate': 1.4565606518845976e-09}
16386
+
16387
+ [Rank 0] Trainer log: {'loss': 0.3795, 'grad_norm': 2.248142957687378, 'learning_rate': 1.4565606518845976e-09}
16388
+ {'loss': 0.3795, 'grad_norm': 2.248142957687378, 'learning_rate': 1.4565606518845976e-09, 'epoch': 0.99}
16389
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
16390
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16391
+ {'train/tv_loss': 0.00012514020781964063, 'train/lm_loss': 1.3207952724769713e-05, 'train/info_loss': 1.2516818969743326e-05, 'train/ref_loss': 0.19032391905784607, 'train/uncertainty_loss': -6.848637713119388e-05, 'train/video_loss': 0.19126906991004944, 'train/total_loss': 0.1912822723388672}
16392
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16393
+ tensor(0.2011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16394
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
16395
+ {'train/tv_loss': None, 'train/lm_loss': 0.21502296924591066, 'train/info_loss': 0.1758623719215393, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011166655458509922, 'train/video_loss': 0.1757507026195526, 'train/total_loss': 0.3907736539840698}
16396
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
16397
+ tensor(-0.0008, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:2', grad_fn=<MulBackward0>)
16398
+ tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
16399
+ [Rank 2] Trainer log: {'loss': 0.3061, 'grad_norm': 9.498651504516602, 'learning_rate': 1.2801840238707565e-09}[Rank 0] Trainer log: {'loss': 0.3061, 'grad_norm': 9.498651504516602, 'learning_rate': 1.2801840238707565e-09}[Rank 3] Trainer log: {'loss': 0.3061, 'grad_norm': 9.498651504516602, 'learning_rate': 1.2801840238707565e-09}
16400
+
16401
+
16402
+ [Rank 1] Trainer log: {'loss': 0.3061, 'grad_norm': 9.498651504516602, 'learning_rate': 1.2801840238707565e-09}
16403
+ {'loss': 0.3061, 'grad_norm': 9.498651504516602, 'learning_rate': 1.2801840238707565e-09, 'epoch': 1.0}
16404
+ tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
16405
+ tensor(0.0269, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16406
+ {'train/tv_loss': 0.00012117947917431593, 'train/lm_loss': 2.856050559785217e-05, 'train/info_loss': 1.7702266632113606e-05, 'train/ref_loss': 0.23599904775619507, 'train/uncertainty_loss': 0.0026872064918279648, 'train/video_loss': 0.23967339098453522, 'train/total_loss': 0.23970195651054382}
16407
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16408
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16409
+ tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
16410
+ {'train/tv_loss': None, 'train/lm_loss': 0.2651496648788452, 'train/info_loss': 0.16941337287425995, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012172098504379392, 'train/video_loss': 0.1692916452884674, 'train/total_loss': 0.43444132804870605}
16411
+ tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
16412
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16413
+ tensor(0.0431, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16414
+ [Rank 2] Trainer log: {'loss': 0.2937, 'grad_norm': 1.8806184530258179, 'learning_rate': 1.1151855943225543e-09}[Rank 3] Trainer log: {'loss': 0.2937, 'grad_norm': 1.8806184530258179, 'learning_rate': 1.1151855943225543e-09}[Rank 0] Trainer log: {'loss': 0.2937, 'grad_norm': 1.8806184530258179, 'learning_rate': 1.1151855943225543e-09}
16415
+
16416
+
16417
+ [Rank 1] Trainer log: {'loss': 0.2937, 'grad_norm': 1.8806184530258179, 'learning_rate': 1.1151855943225543e-09}
16418
+ {'loss': 0.2937, 'grad_norm': 1.8806184530258179, 'learning_rate': 1.1151855943225543e-09, 'epoch': 1.0}
16419
+ tensor(-0.0014, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:1', grad_fn=<MulBackward0>)
16420
+ tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
16421
+ tensor(0.1725, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16422
+ tensor(-0.0008, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:0', grad_fn=<MulBackward0>)
16423
+ {'train/tv_loss': 0.00032057708594948056, 'train/lm_loss': 1.3088752166368068e-05, 'train/info_loss': 1.3112849956087302e-05, 'train/ref_loss': 0.06266696751117706, 'train/uncertainty_loss': -7.520327344536782e-05, 'train/video_loss': 0.06516949087381363, 'train/total_loss': 0.06518258154392242}
16424
+ tensor(-0.0010, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:3', grad_fn=<MulBackward0>)
16425
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
16426
+ {'train/tv_loss': None, 'train/lm_loss': 0.35767147541046146, 'train/info_loss': 0.1781483143568039, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011188344797119498, 'train/video_loss': 0.17803643643856049, 'train/total_loss': 0.5357078909873962}
16427
+ tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
16428
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16429
+ [Rank 2] Trainer log: {'loss': 0.3343, 'grad_norm': 7.238387107849121, 'learning_rate': 9.615655510020193e-10}[Rank 3] Trainer log: {'loss': 0.3343, 'grad_norm': 7.238387107849121, 'learning_rate': 9.615655510020193e-10}[Rank 0] Trainer log: {'loss': 0.3343, 'grad_norm': 7.238387107849121, 'learning_rate': 9.615655510020193e-10}
16430
+
16431
+
16432
+ [Rank 1] Trainer log: {'loss': 0.3343, 'grad_norm': 7.238387107849121, 'learning_rate': 9.615655510020193e-10}
16433
+ {'loss': 0.3343, 'grad_norm': 7.238387107849121, 'learning_rate': 9.615655510020193e-10, 'epoch': 1.0}
16434
+ tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
16435
+ tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
16436
+ tensor(0.1212, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16437
+ {'train/tv_loss': 0.00014093549689278006, 'train/lm_loss': 1.683164300629869e-05, 'train/info_loss': 1.4185704458213877e-05, 'train/ref_loss': 0.2974424362182617, 'train/uncertainty_loss': 0.012122622132301331, 'train/video_loss': 0.3107067346572876, 'train/total_loss': 0.31072357296943665}
16438
+ tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
16439
+ tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
16440
+ tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
16441
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16442
+ tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
16443
+ {'train/tv_loss': None, 'train/lm_loss': 0.05586314797401429, 'train/info_loss': 0.11093997955322266, 'train/ref_loss': None, 'train/uncertainty_loss': -8.851074380800128e-05, 'train/video_loss': 0.1108514666557312, 'train/total_loss': 0.166714608669281}
16444
+ [Rank 0] Trainer log: {'loss': 0.3126, 'grad_norm': 2.235849380493164, 'learning_rate': 8.193240687226489e-10}[Rank 1] Trainer log: {'loss': 0.3126, 'grad_norm': 2.235849380493164, 'learning_rate': 8.193240687226489e-10}[Rank 3] Trainer log: {'loss': 0.3126, 'grad_norm': 2.235849380493164, 'learning_rate': 8.193240687226489e-10}
16445
+
16446
+
16447
+ [Rank 2] Trainer log: {'loss': 0.3126, 'grad_norm': 2.235849380493164, 'learning_rate': 8.193240687226489e-10}
16448
+ {'loss': 0.3126, 'grad_norm': 2.235849380493164, 'learning_rate': 8.193240687226489e-10, 'epoch': 1.0}
16449
+ tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
16450
+ tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
16451
+ tensor(0.1352, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16452
+ {'train/tv_loss': 0.00017385379178449514, 'train/lm_loss': 6.800925475545228e-05, 'train/info_loss': 1.9728748156921938e-05, 'train/ref_loss': 0.30475878715515137, 'train/uncertainty_loss': 0.013520647585391999, 'train/video_loss': 0.3196900188922882, 'train/total_loss': 0.31975802779197693}
16453
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16454
+ tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
16455
+ tensor(-0.0008, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:2', grad_fn=<MulBackward0>)
16456
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16457
+ {'train/tv_loss': 0.00015056305564939976, 'train/lm_loss': 2.4770156596787277e-05, 'train/info_loss': 1.7702266632113606e-05, 'train/ref_loss': 0.18287095427513123, 'train/uncertainty_loss': -7.037441246211529e-05, 'train/video_loss': 0.18402278423309326, 'train/total_loss': 0.18404754996299744}
16458
+ tensor(0.1576, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16459
+ [Rank 3] Trainer log: {'loss': 0.2891, 'grad_norm': 1.8388005495071411, 'learning_rate': 6.88461309351629e-10}
16460
+ [Rank 0] Trainer log: {'loss': 0.2891, 'grad_norm': 1.8388005495071411, 'learning_rate': 6.88461309351629e-10}[Rank 1] Trainer log: {'loss': 0.2891, 'grad_norm': 1.8388005495071411, 'learning_rate': 6.88461309351629e-10}
16461
+
16462
+ [Rank 2] Trainer log: {'loss': 0.2891, 'grad_norm': 1.8388005495071411, 'learning_rate': 6.88461309351629e-10}
16463
+ {'loss': 0.2891, 'grad_norm': 1.8388005495071411, 'learning_rate': 6.88461309351629e-10, 'epoch': 1.0}
16464
+ tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
16465
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16466
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16467
+ tensor(0.0547, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16468
+ {'train/tv_loss': 9.430212085135282e-05, 'train/lm_loss': 1.3041071360930802e-05, 'train/info_loss': 1.293404056923464e-05, 'train/ref_loss': 0.24797077476978302, 'train/uncertainty_loss': 0.005471675470471383, 'train/video_loss': 0.2542097866535187, 'train/total_loss': 0.2542228400707245}
16469
+ tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
16470
+ tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
16471
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16472
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16473
+ {'train/tv_loss': 0.0001686649862676859, 'train/lm_loss': 1.6736284305807202e-05, 'train/info_loss': 1.4662527973996475e-05, 'train/ref_loss': 0.16706281900405884, 'train/uncertainty_loss': -7.104419637471437e-05, 'train/video_loss': 0.16835574805736542, 'train/total_loss': 0.1683724820613861}
16474
+ [Rank 3] Trainer log: {'loss': 0.2849, 'grad_norm': 2.7503747940063477, 'learning_rate': 5.689774218065047e-10}
16475
+ [Rank 0] Trainer log: {'loss': 0.2849, 'grad_norm': 2.7503747940063477, 'learning_rate': 5.689774218065047e-10}[Rank 1] Trainer log: {'loss': 0.2849, 'grad_norm': 2.7503747940063477, 'learning_rate': 5.689774218065047e-10}
16476
+
16477
+ [Rank 2] Trainer log: {'loss': 0.2849, 'grad_norm': 2.7503747940063477, 'learning_rate': 5.689774218065047e-10}
16478
+ {'loss': 0.2849, 'grad_norm': 2.7503747940063477, 'learning_rate': 5.689774218065047e-10, 'epoch': 1.0}
16479
+ tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
16480
+ tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
16481
+ {'train/tv_loss': None, 'train/lm_loss': 0.2963602066040039, 'train/info_loss': 0.15512116253376007, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010410062968730928, 'train/video_loss': 0.15501706302165985, 'train/total_loss': 0.451377272605896}
16482
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16483
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16484
+ tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
16485
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16486
+ {'train/tv_loss': 0.00016122745582833887, 'train/lm_loss': 1.504364627180621e-05, 'train/info_loss': 1.3768483768217266e-05, 'train/ref_loss': 0.12243026494979858, 'train/uncertainty_loss': -7.243495201691985e-05, 'train/video_loss': 0.12366142123937607, 'train/total_loss': 0.12367646396160126}
16487
+ tensor(0.0812, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16488
+ tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
16489
+ [Rank 0] Trainer log: {'loss': 0.3402, 'grad_norm': 4.494094371795654, 'learning_rate': 4.608725420540694e-10}[Rank 1] Trainer log: {'loss': 0.3402, 'grad_norm': 4.494094371795654, 'learning_rate': 4.608725420540694e-10}
16490
+ [Rank 2] Trainer log: {'loss': 0.3402, 'grad_norm': 4.494094371795654, 'learning_rate': 4.608725420540694e-10}
16491
+
16492
+ [Rank 3] Trainer log: {'loss': 0.3402, 'grad_norm': 4.494094371795654, 'learning_rate': 4.608725420540694e-10}
16493
+ {'loss': 0.3402, 'grad_norm': 4.494094371795654, 'learning_rate': 4.608725420540694e-10, 'epoch': 1.0}
16494
+ tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
16495
+ tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
16496
+ {'train/tv_loss': None, 'train/lm_loss': 0.035262671113014225, 'train/info_loss': 0.207004576921463, 'train/ref_loss': None, 'train/uncertainty_loss': -8.50230921059847e-05, 'train/video_loss': 0.20691955089569092, 'train/total_loss': 0.24218222498893738}
16497
+ tensor(0.1316, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16498
+ tensor(0.0744, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16499
+ tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
16500
+ {'train/tv_loss': None, 'train/lm_loss': 0.08468132019042969, 'train/info_loss': 0.13620348274707794, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001168315066024661, 'train/video_loss': 0.13608665764331818, 'train/total_loss': 0.22076797485351562}
16501
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
16502
+ tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
16503
+ tensor(-0.0008, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:3', grad_fn=<MulBackward0>)
16504
+ [Rank 2] Trainer log: {'loss': 0.3431, 'grad_norm': 8.961502075195312, 'learning_rate': 3.6414679311591595e-10}[Rank 3] Trainer log: {'loss': 0.3431, 'grad_norm': 8.961502075195312, 'learning_rate': 3.6414679311591595e-10}
16505
+
16506
+ [Rank 1] Trainer log: {'loss': 0.3431, 'grad_norm': 8.961502075195312, 'learning_rate': 3.6414679311591595e-10}
16507
+ [Rank 0] Trainer log: {'loss': 0.3431, 'grad_norm': 8.961502075195312, 'learning_rate': 3.6414679311591595e-10}
16508
+ {'loss': 0.3431, 'grad_norm': 8.961502075195312, 'learning_rate': 3.6414679311591595e-10, 'epoch': 1.0}
16509
+ tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
16510
+ tensor(0.1708, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16511
+ tensor(0.1118, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16512
+ {'train/tv_loss': 0.0001698122243396938, 'train/lm_loss': 1.723692112136632e-05, 'train/info_loss': 1.293404056923464e-05, 'train/ref_loss': 0.2866149842739105, 'train/uncertainty_loss': 0.01117597669363022, 'train/video_loss': 0.2991624176502228, 'train/total_loss': 0.29917964339256287}
16513
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16514
+ tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
16515
+ {'train/tv_loss': None, 'train/lm_loss': 0.3133418560028076, 'train/info_loss': 0.17231473326683044, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012505786726251246, 'train/video_loss': 0.17218968272209167, 'train/total_loss': 0.4855315387248993}
16516
+ tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
16517
+ tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
16518
+ tensor(0.0181, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16519
+ [Rank 2] Trainer log: {'loss': 0.3475, 'grad_norm': 2.2179486751556396, 'learning_rate': 2.7880028506066526e-10}
16520
+ [Rank 3] Trainer log: {'loss': 0.3475, 'grad_norm': 2.2179486751556396, 'learning_rate': 2.7880028506066526e-10}
16521
+ [Rank 1] Trainer log: {'loss': 0.3475, 'grad_norm': 2.2179486751556396, 'learning_rate': 2.7880028506066526e-10}
16522
+ [Rank 0] Trainer log: {'loss': 0.3475, 'grad_norm': 2.2179486751556396, 'learning_rate': 2.7880028506066526e-10}
16523
+ {'loss': 0.3475, 'grad_norm': 2.2179486751556396, 'learning_rate': 2.7880028506066526e-10, 'epoch': 1.0}
16524
+ tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
16525
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16526
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16527
+ tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
16528
+ {'train/tv_loss': None, 'train/lm_loss': 0.3884145736694336, 'train/info_loss': 0.23487679660320282, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012862115399912, 'train/video_loss': 0.23474816977977753, 'train/total_loss': 0.6231627464294434}
16529
+ tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
16530
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16531
+ tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
16532
+ {'train/tv_loss': None, 'train/lm_loss': 0.010719782114028931, 'train/info_loss': 0.19857865571975708, 'train/ref_loss': None, 'train/uncertainty_loss': -8.820317452773452e-05, 'train/video_loss': 0.1984904557466507, 'train/total_loss': 0.20921023190021515}
16533
+ tensor(0.0573, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16534
+ [Rank 3] Trainer log: {'loss': 0.2975, 'grad_norm': 3.0862128734588623, 'learning_rate': 2.0483311501062751e-10}
16535
+ [Rank 1] Trainer log: {'loss': 0.2975, 'grad_norm': 3.0862128734588623, 'learning_rate': 2.0483311501062751e-10}[Rank 0] Trainer log: {'loss': 0.2975, 'grad_norm': 3.0862128734588623, 'learning_rate': 2.0483311501062751e-10}
16536
+ [Rank 2] Trainer log: {'loss': 0.2975, 'grad_norm': 3.0862128734588623, 'learning_rate': 2.0483311501062751e-10}
16537
+
16538
+ {'loss': 0.2975, 'grad_norm': 3.0862128734588623, 'learning_rate': 2.0483311501062751e-10, 'epoch': 1.0}
16539
+ tensor(-0.0008, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:1', grad_fn=<MulBackward0>)
16540
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16541
+ {'train/tv_loss': 0.0002269430086016655, 'train/lm_loss': 1.3041071360930802e-05, 'train/info_loss': 1.3768483768217266e-05, 'train/ref_loss': 0.15769097208976746, 'train/uncertainty_loss': -7.11314962245524e-05, 'train/video_loss': 0.15944914519786835, 'train/total_loss': 0.15946218371391296}
16542
+ tensor(0.1142, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16543
+ tensor(-0.0008, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:3', grad_fn=<MulBackward0>)
16544
+ tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
16545
+ tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
16546
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16547
+ {'train/tv_loss': 0.00013790428638458252, 'train/lm_loss': 2.784535172395408e-05, 'train/info_loss': 1.6867830709088594e-05, 'train/ref_loss': 0.2233009785413742, 'train/uncertainty_loss': -7.168206502683461e-05, 'train/video_loss': 0.22434939444065094, 'train/total_loss': 0.22437724471092224}
16548
+ tensor(0.1906, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16549
+ [Rank 3] Trainer log: {'loss': 0.2824, 'grad_norm': 2.1807076930999756, 'learning_rate': 1.4224536713847158e-10}[Rank 0] Trainer log: {'loss': 0.2824, 'grad_norm': 2.1807076930999756, 'learning_rate': 1.4224536713847158e-10}[Rank 2] Trainer log: {'loss': 0.2824, 'grad_norm': 2.1807076930999756, 'learning_rate': 1.4224536713847158e-10}
16550
+
16551
+ [Rank 1] Trainer log: {'loss': 0.2824, 'grad_norm': 2.1807076930999756, 'learning_rate': 1.4224536713847158e-10}
16552
+
16553
+ {'loss': 0.2824, 'grad_norm': 2.1807076930999756, 'learning_rate': 1.4224536713847158e-10, 'epoch': 1.0}
16554
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
16555
+ tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
16556
+ tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
16557
+ {'train/tv_loss': None, 'train/lm_loss': 0.1099284052848816, 'train/info_loss': 0.18190373480319977, 'train/ref_loss': None, 'train/uncertainty_loss': -9.252233430743218e-05, 'train/video_loss': 0.18181121349334717, 'train/total_loss': 0.2917396128177643}
16558
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16559
+ tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
16560
+ tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
16561
+ tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
16562
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
16563
+ {'train/tv_loss': None, 'train/lm_loss': 0.2040557146072388, 'train/info_loss': 0.19522923231124878, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010584730189293623, 'train/video_loss': 0.19512338936328888, 'train/total_loss': 0.3991791009902954}
16564
+ [Rank 1] Trainer log: {'loss': 0.4, 'grad_norm': 2.9970059394836426, 'learning_rate': 9.103711266611471e-11}
16565
+ [Rank 0] Trainer log: {'loss': 0.4, 'grad_norm': 2.9970059394836426, 'learning_rate': 9.103711266611471e-11}[Rank 3] Trainer log: {'loss': 0.4, 'grad_norm': 2.9970059394836426, 'learning_rate': 9.103711266611471e-11}
16566
+ [Rank 2] Trainer log: {'loss': 0.4, 'grad_norm': 2.9970059394836426, 'learning_rate': 9.103711266611471e-11}
16567
+
16568
+ {'loss': 0.4, 'grad_norm': 2.9970059394836426, 'learning_rate': 9.103711266611471e-11, 'epoch': 1.0}
16569
+ tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
16570
+ tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
16571
+ tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
16572
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16573
+ {'train/tv_loss': 0.00012163397623226047, 'train/lm_loss': 2.4770156596787277e-05, 'train/info_loss': 1.6629419405944645e-05, 'train/ref_loss': 0.0847846046090126, 'train/uncertainty_loss': -7.099361391738057e-05, 'train/video_loss': 0.0857033059000969, 'train/total_loss': 0.08572807908058167}
16574
+ tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
16575
+ {'train/tv_loss': None, 'train/lm_loss': 0.30223650932312013, 'train/info_loss': 0.21647511422634125, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013338448479771614, 'train/video_loss': 0.21634173393249512, 'train/total_loss': 0.518578290939331}
16576
+ tensor(-0.0014, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:3', grad_fn=<MulBackward0>)
16577
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16578
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16579
+ [Rank 3] Trainer log: {'loss': 0.3471, 'grad_norm': 2.964756965637207, 'learning_rate': 5.12084098680532e-11}[Rank 0] Trainer log: {'loss': 0.3471, 'grad_norm': 2.964756965637207, 'learning_rate': 5.12084098680532e-11}[Rank 2] Trainer log: {'loss': 0.3471, 'grad_norm': 2.964756965637207, 'learning_rate': 5.12084098680532e-11}
16580
+
16581
+
16582
+ {'loss': 0.3471, 'grad_norm': 2.964756965637207, 'learning_rate': 5.12084098680532e-11, 'epoch': 1.0}
16583
+ [Rank 1] Trainer log: {'loss': 0.3471, 'grad_norm': 2.964756965637207, 'learning_rate': 5.12084098680532e-11}
16584
+ tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
16585
+ tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
16586
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16587
+ {'train/tv_loss': 0.00012253506574779748, 'train/lm_loss': 1.4829085557721555e-05, 'train/info_loss': 1.2755231182381976e-05, 'train/ref_loss': 0.06293053925037384, 'train/uncertainty_loss': -6.927629001438618e-05, 'train/video_loss': 0.06385429948568344, 'train/total_loss': 0.06386912614107132}
16588
+ tensor(0.3175, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16589
+ tensor(0.2061, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16590
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16591
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
16592
+ {'train/tv_loss': 0.00018769102171063424, 'train/lm_loss': 3.144493966829032e-05, 'train/info_loss': 1.710624019324314e-05, 'train/ref_loss': 0.18825295567512512, 'train/uncertainty_loss': -6.965706706978381e-05, 'train/video_loss': 0.18970192968845367, 'train/total_loss': 0.1897333711385727}
16593
+ tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
16594
+ [Rank 1] Trainer log: {'loss': 0.2313, 'grad_norm': 3.738187551498413, 'learning_rate': 2.2759304065811394e-11}[Rank 0] Trainer log: {'loss': 0.2313, 'grad_norm': 3.738187551498413, 'learning_rate': 2.2759304065811394e-11}[Rank 2] Trainer log: {'loss': 0.2313, 'grad_norm': 3.738187551498413, 'learning_rate': 2.2759304065811394e-11}
16595
+
16596
+ [Rank 3] Trainer log: {'loss': 0.2313, 'grad_norm': 3.738187551498413, 'learning_rate': 2.2759304065811394e-11}
16597
+
16598
+ {'loss': 0.2313, 'grad_norm': 3.738187551498413, 'learning_rate': 2.2759304065811394e-11, 'epoch': 1.0}
16599
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
16600
+ {'train/tv_loss': None, 'train/lm_loss': 0.08768467903137207, 'train/info_loss': 0.19097542762756348, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011287924135103822, 'train/video_loss': 0.19086255133152008, 'train/total_loss': 0.2785472273826599}
16601
+ tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
16602
+ tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
16603
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
16604
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
16605
+ tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
16606
+ {'train/tv_loss': None, 'train/lm_loss': 0.3417798757553101, 'train/info_loss': 0.12373294681310654, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011930712498724461, 'train/video_loss': 0.123613640666008, 'train/total_loss': 0.4653935432434082}
16607
+ tensor(-0.0014, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:3', grad_fn=<MulBackward0>)
16608
+ tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
16609
+ [Rank 1] Trainer log: {'loss': 0.3386, 'grad_norm': 3.233135223388672, 'learning_rate': 5.6898276357131296e-12}[Rank 2] Trainer log: {'loss': 0.3386, 'grad_norm': 3.233135223388672, 'learning_rate': 5.6898276357131296e-12}[Rank 3] Trainer log: {'loss': 0.3386, 'grad_norm': 3.233135223388672, 'learning_rate': 5.6898276357131296e-12}
16610
+
16611
+
16612
+ [Rank 0] Trainer log: {'loss': 0.3386, 'grad_norm': 3.233135223388672, 'learning_rate': 5.6898276357131296e-12}
16613
+ {'loss': 0.3386, 'grad_norm': 3.233135223388672, 'learning_rate': 5.6898276357131296e-12, 'epoch': 1.0}
16614
+ tensor(-0.0014, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:0', grad_fn=<MulBackward0>)
16615
+ {'train/tv_loss': None, 'train/lm_loss': 0.22061138153076174, 'train/info_loss': 0.19586126506328583, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013994580367580057, 'train/video_loss': 0.1957213133573532, 'train/total_loss': 0.4163326919078827}
16616
+ tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
16617
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
16618
+ tensor(0.3543, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
16619
+ tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
16620
+ tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
16621
+ {'train/tv_loss': None, 'train/lm_loss': 0.5868082523345948, 'train/info_loss': 0.14800412952899933, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013187677832320334, 'train/video_loss': 0.14787225425243378, 'train/total_loss': 0.7346805334091187}
16622
+ tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
16623
+ tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
16624
+ [Rank 1] Trainer log: {'loss': 0.4673, 'grad_norm': 8.241683959960938, 'learning_rate': 0.0}[Rank 2] Trainer log: {'loss': 0.4673, 'grad_norm': 8.241683959960938, 'learning_rate': 0.0}[Rank 3] Trainer log: {'loss': 0.4673, 'grad_norm': 8.241683959960938, 'learning_rate': 0.0}
16625
+
16626
+
16627
+ [Rank 0] Trainer log: {'loss': 0.4673, 'grad_norm': 8.241683959960938, 'learning_rate': 0.0}
16628
+ {'loss': 0.4673, 'grad_norm': 8.241683959960938, 'learning_rate': 0.0, 'epoch': 1.0}
16629
+ [Rank 1] Trainer log: {'train_runtime': 29823.9547, 'train_samples_per_second': 0.832, 'train_steps_per_second': 0.104, 'total_flos': 4.969273863004226e+18, 'train_loss': 0.11426909348176371}[Rank 2] Trainer log: {'train_runtime': 29822.0432, 'train_samples_per_second': 0.832, 'train_steps_per_second': 0.104, 'total_flos': 4.969273863004226e+18, 'train_loss': 0.11426909348176371}[Rank 3] Trainer log: {'train_runtime': 29817.5825, 'train_samples_per_second': 0.832, 'train_steps_per_second': 0.104, 'total_flos': 4.969273863004226e+18, 'train_loss': 0.11426909348176371}
16630
+
16631
+
16632
+ [Rank 0] Trainer log: {'train_runtime': 29828.2573, 'train_samples_per_second': 0.832, 'train_steps_per_second': 0.104, 'total_flos': 4.969273863004226e+18, 'train_loss': 0.11426909348176371}
16633
+ {'train_runtime': 29828.2573, 'train_samples_per_second': 0.832, 'train_steps_per_second': 0.104, 'train_loss': 0.11426909348176371, 'epoch': 1.0}
16634
+ Finished TrainingFinished Training
16635
+ Finished Training
16636
+