aiden200 commited on
Commit
b4a2b50
·
verified ·
1 Parent(s): e1a1300

Training in progress, step 2075

Browse files
Files changed (2) hide show
  1. adapter_model.safetensors +1 -1
  2. train.log +360 -0
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:29a81378ab1f78803e81097b916a079f8e851fd7ab413fde0729ae1de1e9207a
3
  size 1204780872
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b24c739c235e4e9845a17684af3e9ae65224f6f9ce24eac3682df1571e15c4f5
3
  size 1204780872
train.log CHANGED
@@ -17413,3 +17413,363 @@ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device=
17413
  tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17414
  tensor(0.4664, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17415
  {'train/tv_loss': 0.00020973044447600842, 'train/lm_loss': 4.715363611467183e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.5165361166000366, 'train/uncertainty_loss': 0.0466405063867569, 'train/video_loss': 0.5648768544197083, 'train/total_loss': 0.5649240016937256}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17413
  tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17414
  tensor(0.4664, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17415
  {'train/tv_loss': 0.00020973044447600842, 'train/lm_loss': 4.715363611467183e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.5165361166000366, 'train/uncertainty_loss': 0.0466405063867569, 'train/video_loss': 0.5648768544197083, 'train/total_loss': 0.5649240016937256}
17416
+ tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
17417
+ tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
17418
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
17419
+ {'train/tv_loss': None, 'train/lm_loss': 0.36234724521636963, 'train/info_loss': 0.14337365329265594, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011461186222732067, 'train/video_loss': 0.14325904846191406, 'train/total_loss': 0.5056062936782837}
17420
+ tensor(0.0681, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17421
+ [Rank 3] Trainer log: {'loss': 0.3913, 'grad_norm': 3.6058335304260254, 'learning_rate': 5.615223237580377e-06}[Rank 0] Trainer log: {'loss': 0.3913, 'grad_norm': 3.6058335304260254, 'learning_rate': 5.615223237580377e-06}[Rank 2] Trainer log: {'loss': 0.3913, 'grad_norm': 3.6058335304260254, 'learning_rate': 5.615223237580377e-06}
17422
+
17423
+
17424
+ [Rank 1] Trainer log: {'loss': 0.3913, 'grad_norm': 3.6058335304260254, 'learning_rate': 5.615223237580377e-06}
17425
+ {'loss': 0.3913, 'grad_norm': 3.6058335304260254, 'learning_rate': 5.615223237580377e-06, 'epoch': 0.66}
17426
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
17427
+ tensor(-0.0014, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:3', grad_fn=<MulBackward0>)
17428
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
17429
+ {'train/tv_loss': None, 'train/lm_loss': 0.06900591254234315, 'train/info_loss': 0.15682968497276306, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010683162836357952, 'train/video_loss': 0.15672285854816437, 'train/total_loss': 0.22572878003120422}
17430
+ tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
17431
+ tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
17432
+ {'train/tv_loss': None, 'train/lm_loss': 0.40285620689392093, 'train/info_loss': 0.18601451814174652, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011755496961995959, 'train/video_loss': 0.18589696288108826, 'train/total_loss': 0.5887531638145447}
17433
+ tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
17434
+ tensor(0.0370, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17435
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17436
+ [Rank 1] Trainer log: {'loss': 0.3731, 'grad_norm': 2.03836727142334, 'learning_rate': 5.605638360685988e-06}
17437
+ [Rank 0] Trainer log: {'loss': 0.3731, 'grad_norm': 2.03836727142334, 'learning_rate': 5.605638360685988e-06}[Rank 3] Trainer log: {'loss': 0.3731, 'grad_norm': 2.03836727142334, 'learning_rate': 5.605638360685988e-06}
17438
+
17439
+ [Rank 2] Trainer log: {'loss': 0.3731, 'grad_norm': 2.03836727142334, 'learning_rate': 5.605638360685988e-06}
17440
+ {'loss': 0.3731, 'grad_norm': 2.03836727142334, 'learning_rate': 5.605638360685988e-06, 'epoch': 0.66}
17441
+ tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
17442
+ tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
17443
+ tensor(0.2234, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17444
+ {'train/tv_loss': 0.00023927739821374419, 'train/lm_loss': 2.8107574325986207e-05, 'train/info_loss': 1.8238688426208682e-05, 'train/ref_loss': 0.3630368113517761, 'train/uncertainty_loss': 0.022341114282608033, 'train/video_loss': 0.3873103857040405, 'train/total_loss': 0.3873384892940521}
17445
+ tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
17446
+ tensor(-0.0013, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:1', grad_fn=<MulBackward0>)
17447
+ tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
17448
+ tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
17449
+ {'train/tv_loss': None, 'train/lm_loss': 0.14715486764907837, 'train/info_loss': 0.13828474283218384, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001031541614793241, 'train/video_loss': 0.13818158209323883, 'train/total_loss': 0.285336434841156}
17450
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17451
+ [Rank 0] Trainer log: {'loss': 0.3744, 'grad_norm': 5.042629241943359, 'learning_rate': 5.5960584844236565e-06}[Rank 1] Trainer log: {'loss': 0.3744, 'grad_norm': 5.042629241943359, 'learning_rate': 5.5960584844236565e-06}
17452
+ [Rank 2] Trainer log: {'loss': 0.3744, 'grad_norm': 5.042629241943359, 'learning_rate': 5.5960584844236565e-06}
17453
+
17454
+ [Rank 3] Trainer log: {'loss': 0.3744, 'grad_norm': 5.042629241943359, 'learning_rate': 5.5960584844236565e-06}
17455
+ {'loss': 0.3744, 'grad_norm': 5.042629241943359, 'learning_rate': 5.5960584844236565e-06, 'epoch': 0.66}
17456
+ tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
17457
+ {'train/tv_loss': None, 'train/lm_loss': 0.4195257663726807, 'train/info_loss': 0.10877030342817307, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011566100874915719, 'train/video_loss': 0.10865464061498642, 'train/total_loss': 0.5281804203987122}
17458
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17459
+ tensor(0.1955, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17460
+ tensor(-0.0010, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:3', grad_fn=<MulBackward0>)
17461
+ tensor(-0.0016, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0016, device='cuda:0', grad_fn=<MulBackward0>)
17462
+ {'train/tv_loss': None, 'train/lm_loss': 2.8298282995820048e-05, 'train/info_loss': 2.342404332011938e-05, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00015964442864060404, 'train/video_loss': -0.00013622039114125073, 'train/total_loss': -0.00010792211105581373}
17463
+ tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
17464
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17465
+ tensor(0.0522, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17466
+ [Rank 3] Trainer log: {'loss': 0.2652, 'grad_norm': 10.820907592773438, 'learning_rate': 5.586483619694953e-06}
17467
+ [Rank 0] Trainer log: {'loss': 0.2652, 'grad_norm': 10.820907592773438, 'learning_rate': 5.586483619694953e-06}[Rank 1] Trainer log: {'loss': 0.2652, 'grad_norm': 10.820907592773438, 'learning_rate': 5.586483619694953e-06}
17468
+
17469
+ [Rank 2] Trainer log: {'loss': 0.2652, 'grad_norm': 10.820907592773438, 'learning_rate': 5.586483619694953e-06}
17470
+ {'loss': 0.2652, 'grad_norm': 10.820907592773438, 'learning_rate': 5.586483619694953e-06, 'epoch': 0.66}
17471
+ tensor(-0.0016, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0016, device='cuda:3', grad_fn=<MulBackward0>)
17472
+ tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
17473
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17474
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17475
+ {'train/tv_loss': 0.00031795650720596315, 'train/lm_loss': 7.840050384402276e-05, 'train/info_loss': 2.413929905742407e-05, 'train/ref_loss': 0.1724846065044403, 'train/uncertainty_loss': -6.841651047579944e-05, 'train/video_loss': 0.17498399317264557, 'train/total_loss': 0.1750623881816864}
17476
+ tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
17477
+ {'train/tv_loss': None, 'train/lm_loss': 0.19196442365646363, 'train/info_loss': 0.2579406201839447, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011937021045014263, 'train/video_loss': 0.257821261882782, 'train/total_loss': 0.4497857093811035}
17478
+ tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
17479
+ tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
17480
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17481
+ [Rank 2] Trainer log: {'loss': 0.3145, 'grad_norm': 6.420591831207275, 'learning_rate': 5.576913777395749e-06}
17482
+ [Rank 3] Trainer log: {'loss': 0.3145, 'grad_norm': 6.420591831207275, 'learning_rate': 5.576913777395749e-06}
17483
+ [Rank 0] Trainer log: {'loss': 0.3145, 'grad_norm': 6.420591831207275, 'learning_rate': 5.576913777395749e-06}[Rank 1] Trainer log: {'loss': 0.3145, 'grad_norm': 6.420591831207275, 'learning_rate': 5.576913777395749e-06}
17484
+
17485
+ {'loss': 0.3145, 'grad_norm': 6.420591831207275, 'learning_rate': 5.576913777395749e-06, 'epoch': 0.66}
17486
+ tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
17487
+ tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
17488
+ {'train/tv_loss': None, 'train/lm_loss': 0.3795618772506714, 'train/info_loss': 0.29438889026641846, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011655737180262805, 'train/video_loss': 0.2942723333835602, 'train/total_loss': 0.6738342046737671}
17489
+ tensor(-0.0010, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:3', grad_fn=<MulBackward0>)
17490
+ tensor(-0.0014, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:1', grad_fn=<MulBackward0>)
17491
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
17492
+ {'train/tv_loss': None, 'train/lm_loss': 0.32472651004791264, 'train/info_loss': 0.22495689988136292, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001121837878599763, 'train/video_loss': 0.22484470903873444, 'train/total_loss': 0.5495712161064148}
17493
+ tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
17494
+ tensor(0.0239, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17495
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17496
+ [Rank 2] Trainer log: {'loss': 0.385, 'grad_norm': 4.281194686889648, 'learning_rate': 5.567348968416184e-06}[Rank 3] Trainer log: {'loss': 0.385, 'grad_norm': 4.281194686889648, 'learning_rate': 5.567348968416184e-06}
17497
+
17498
+ [Rank 0] Trainer log: {'loss': 0.385, 'grad_norm': 4.281194686889648, 'learning_rate': 5.567348968416184e-06}
17499
+ [Rank 1] Trainer log: {'loss': 0.385, 'grad_norm': 4.281194686889648, 'learning_rate': 5.567348968416184e-06}
17500
+ {'loss': 0.385, 'grad_norm': 4.281194686889648, 'learning_rate': 5.567348968416184e-06, 'epoch': 0.66}
17501
+ tensor(-0.0014, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:1', grad_fn=<MulBackward0>)
17502
+ tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
17503
+ tensor(0.4046, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17504
+ {'train/tv_loss': 0.00038167270831763745, 'train/lm_loss': 2.8584344545379284e-05, 'train/info_loss': 1.9132725356030278e-05, 'train/ref_loss': 0.49088212847709656, 'train/uncertainty_loss': 0.04045624732971192, 'train/video_loss': 0.5344108939170837, 'train/total_loss': 0.5344395041465759}
17505
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17506
+ tensor(-0.0015, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0015, device='cuda:3', grad_fn=<MulBackward0>)
17507
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
17508
+ {'train/tv_loss': None, 'train/lm_loss': 0.24148297309875488, 'train/info_loss': 0.2007106989622116, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011322950012981892, 'train/video_loss': 0.20059746503829956, 'train/total_loss': 0.44208043813705444}
17509
+ tensor(0.0585, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17510
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
17511
+ [Rank 0] Trainer log: {'loss': 0.4198, 'grad_norm': 8.499199867248535, 'learning_rate': 5.557789203640687e-06}[Rank 2] Trainer log: {'loss': 0.4198, 'grad_norm': 8.499199867248535, 'learning_rate': 5.557789203640687e-06}[Rank 1] Trainer log: {'loss': 0.4198, 'grad_norm': 8.499199867248535, 'learning_rate': 5.557789203640687e-06}
17512
+
17513
+
17514
+ [Rank 3] Trainer log: {'loss': 0.4198, 'grad_norm': 8.499199867248535, 'learning_rate': 5.557789203640687e-06}
17515
+ {'loss': 0.4198, 'grad_norm': 8.499199867248535, 'learning_rate': 5.557789203640687e-06, 'epoch': 0.66}
17516
+ tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
17517
+ tensor(0.2698, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17518
+ tensor(0.4741, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17519
+ {'train/tv_loss': 0.00020899735391139985, 'train/lm_loss': 2.796454355120659e-05, 'train/info_loss': 1.794067611626815e-05, 'train/ref_loss': 0.5671656727790833, 'train/uncertainty_loss': 0.04740565419197083, 'train/video_loss': 0.6162612438201904, 'train/total_loss': 0.6162891983985901}
17520
+ tensor(0.2692, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17521
+ tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
17522
+ tensor(0.2025, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17523
+ {'train/tv_loss': 0.00020984613802284003, 'train/lm_loss': 4.1623553261160855e-05, 'train/info_loss': 2.270885306643322e-05, 'train/ref_loss': 0.34875354170799255, 'train/uncertainty_loss': 0.020245166122913362, 'train/video_loss': 0.3707001805305481, 'train/total_loss': 0.3707418143749237}
17524
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17525
+ tensor(0.1522, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17526
+ [Rank 0] Trainer log: {'loss': 0.3907, 'grad_norm': 24.31747055053711, 'learning_rate': 5.548234493947939e-06}[Rank 1] Trainer log: {'loss': 0.3907, 'grad_norm': 24.31747055053711, 'learning_rate': 5.548234493947939e-06}
17527
+ [Rank 3] Trainer log: {'loss': 0.3907, 'grad_norm': 24.31747055053711, 'learning_rate': 5.548234493947939e-06}
17528
+
17529
+ [Rank 2] Trainer log: {'loss': 0.3907, 'grad_norm': 24.31747055053711, 'learning_rate': 5.548234493947939e-06}
17530
+ {'loss': 0.3907, 'grad_norm': 24.31747055053711, 'learning_rate': 5.548234493947939e-06, 'epoch': 0.66}
17531
+ tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
17532
+ tensor(-0.0014, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:0', grad_fn=<MulBackward0>)
17533
+ {'train/tv_loss': None, 'train/lm_loss': 0.3018145322799683, 'train/info_loss': 0.25296148657798767, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014256734866648912, 'train/video_loss': 0.25281891226768494, 'train/total_loss': 0.5546334385871887}
17534
+ tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
17535
+ tensor(0.7656, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17536
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
17537
+ {'train/tv_loss': None, 'train/lm_loss': 0.39072275161743164, 'train/info_loss': 0.18714040517807007, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010978373466059567, 'train/video_loss': 0.187030628323555, 'train/total_loss': 0.5777533650398254}
17538
+ tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
17539
+ tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
17540
+ tensor(0.0862, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17541
+ [Rank 1] Trainer log: {'loss': 0.4751, 'grad_norm': 13.349773406982422, 'learning_rate': 5.538684850210872e-06}[Rank 0] Trainer log: {'loss': 0.4751, 'grad_norm': 13.349773406982422, 'learning_rate': 5.538684850210872e-06}[Rank 3] Trainer log: {'loss': 0.4751, 'grad_norm': 13.349773406982422, 'learning_rate': 5.538684850210872e-06}
17542
+
17543
+ [Rank 2] Trainer log: {'loss': 0.4751, 'grad_norm': 13.349773406982422, 'learning_rate': 5.538684850210872e-06}
17544
+
17545
+ {'loss': 0.4751, 'grad_norm': 13.349773406982422, 'learning_rate': 5.538684850210872e-06, 'epoch': 0.66}
17546
+ tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
17547
+ tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
17548
+ {'train/tv_loss': None, 'train/lm_loss': 0.3718518972396851, 'train/info_loss': 0.2331818789243698, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00013318745186552405, 'train/video_loss': 0.2330486923456192, 'train/total_loss': 0.604900598526001}
17549
+ tensor(0.2073, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0006, device='cuda:2', grad_fn=<MulBackward0>)
17550
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17551
+ tensor(-0.0010, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:3', grad_fn=<MulBackward0>)
17552
+ tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
17553
+ tensor(0.1576, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17554
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17555
+ {'train/tv_loss': 0.00022317415568977596, 'train/lm_loss': 2.8441313770599666e-05, 'train/info_loss': 1.8834713046089746e-05, 'train/ref_loss': 0.14411097764968872, 'train/uncertainty_loss': -6.683562532998622e-05, 'train/video_loss': 0.1458483785390854, 'train/total_loss': 0.14587682485580444}
17556
+ [Rank 3] Trainer log: {'loss': 0.3281, 'grad_norm': 2.9163901805877686, 'learning_rate': 5.529140283296655e-06}[Rank 0] Trainer log: {'loss': 0.3281, 'grad_norm': 2.9163901805877686, 'learning_rate': 5.529140283296655e-06}
17557
+ [Rank 2] Trainer log: {'loss': 0.3281, 'grad_norm': 2.9163901805877686, 'learning_rate': 5.529140283296655e-06}
17558
+ [Rank 1] Trainer log: {'loss': 0.3281, 'grad_norm': 2.9163901805877686, 'learning_rate': 5.529140283296655e-06}
17559
+
17560
+ {'loss': 0.3281, 'grad_norm': 2.9163901805877686, 'learning_rate': 5.529140283296655e-06, 'epoch': 0.67}
17561
+ tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
17562
+ tensor(-0.0013, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:3', grad_fn=<MulBackward0>)
17563
+ tensor(-0.0014, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:1', grad_fn=<MulBackward0>)
17564
+ tensor(-0.0010, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:0', grad_fn=<MulBackward0>)
17565
+ {'train/tv_loss': None, 'train/lm_loss': 0.062081152200698854, 'train/info_loss': 0.16616980731487274, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001029652892611921, 'train/video_loss': 0.16606684029102325, 'train/total_loss': 0.22814799845218658}
17566
+ tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
17567
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17568
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17569
+ {'train/tv_loss': 0.00018424011068418622, 'train/lm_loss': 3.657006600406021e-05, 'train/info_loss': 2.0682384274550714e-05, 'train/ref_loss': 0.03961974009871483, 'train/uncertainty_loss': -6.689630681648851e-05, 'train/video_loss': 0.04104745015501976, 'train/total_loss': 0.04108402132987976}
17570
+ tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
17571
+ [Rank 2] Trainer log: {'loss': 0.3907, 'grad_norm': 4.135325908660889, 'learning_rate': 5.5196008040666645e-06}[Rank 1] Trainer log: {'loss': 0.3907, 'grad_norm': 4.135325908660889, 'learning_rate': 5.5196008040666645e-06}
17572
+ [Rank 3] Trainer log: {'loss': 0.3907, 'grad_norm': 4.135325908660889, 'learning_rate': 5.5196008040666645e-06}
17573
+
17574
+ [Rank 0] Trainer log: {'loss': 0.3907, 'grad_norm': 4.135325908660889, 'learning_rate': 5.5196008040666645e-06}
17575
+ {'loss': 0.3907, 'grad_norm': 4.135325908660889, 'learning_rate': 5.5196008040666645e-06, 'epoch': 0.67}
17576
+ tensor(-0.0014, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:3', grad_fn=<MulBackward0>)
17577
+ tensor(-0.0013, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:2', grad_fn=<MulBackward0>)
17578
+ tensor(0.0723, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17579
+ tensor(-0.0007, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17580
+ {'train/tv_loss': 0.0002113046357408166, 'train/lm_loss': 4.093228199053556e-05, 'train/info_loss': 2.0324770957813598e-05, 'train/ref_loss': 0.0825377106666565, 'train/uncertainty_loss': -6.546297227032483e-05, 'train/video_loss': 0.08418301492929459, 'train/total_loss': 0.08422394841909409}
17581
+ tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
17582
+ tensor(0.1926, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17583
+ tensor(0.1136, device='cuda:3', grad_fn=<AddBackward0>) tensor(0.0902, device='cuda:0', grad_fn=<AddBackward0>)tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17584
+ tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17585
+ {'train/tv_loss': 0.0002980440389364958, 'train/lm_loss': 3.676076594274491e-05, 'train/info_loss': 2.0980394765501842e-05, 'train/ref_loss': 0.27036359906196594, 'train/uncertainty_loss': 0.00902046412229538, 'train/video_loss': 0.2817894220352173, 'train/total_loss': 0.2818261682987213}
17586
+ [Rank 1] Trainer log: {'loss': 0.3286, 'grad_norm': 2.2409825325012207, 'learning_rate': 5.510066423376514e-06}
17587
+ [Rank 3] Trainer log: {'loss': 0.3286, 'grad_norm': 2.2409825325012207, 'learning_rate': 5.510066423376514e-06}
17588
+ [Rank 2] Trainer log: {'loss': 0.3286, 'grad_norm': 2.2409825325012207, 'learning_rate': 5.510066423376514e-06}
17589
+ [Rank 0] Trainer log: {'loss': 0.3286, 'grad_norm': 2.2409825325012207, 'learning_rate': 5.510066423376514e-06}
17590
+ {'loss': 0.3286, 'grad_norm': 2.2409825325012207, 'learning_rate': 5.510066423376514e-06, 'epoch': 0.67}
17591
+ tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
17592
+ {'train/tv_loss': None, 'train/lm_loss': 0.22054891586303713, 'train/info_loss': 0.16172195971012115, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012107143411412836, 'train/video_loss': 0.16160088777542114, 'train/total_loss': 0.3821498155593872}
17593
+ tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
17594
+ tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
17595
+ tensor(0.0148, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17596
+ tensor(-0.0009, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:3', grad_fn=<MulBackward0>)
17597
+ tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
17598
+ {'train/tv_loss': None, 'train/lm_loss': 0.2663368940353394, 'train/info_loss': 0.10841624438762665, 'train/ref_loss': None, 'train/uncertainty_loss': -8.942196145653725e-05, 'train/video_loss': 0.10832682251930237, 'train/total_loss': 0.37466371059417725}
17599
+ tensor(0.1076, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17600
+ tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
17601
+ [Rank 1] Trainer log: {'loss': 0.3337, 'grad_norm': 2.651632070541382, 'learning_rate': 5.500537152075987e-06}[Rank 2] Trainer log: {'loss': 0.3337, 'grad_norm': 2.651632070541382, 'learning_rate': 5.500537152075987e-06}[Rank 3] Trainer log: {'loss': 0.3337, 'grad_norm': 2.651632070541382, 'learning_rate': 5.500537152075987e-06}
17602
+
17603
+
17604
+ [Rank 0] Trainer log: {'loss': 0.3337, 'grad_norm': 2.651632070541382, 'learning_rate': 5.500537152075987e-06}
17605
+ {'loss': 0.3337, 'grad_norm': 2.651632070541382, 'learning_rate': 5.500537152075987e-06, 'epoch': 0.67}
17606
+ tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
17607
+ tensor(0.2763, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17608
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17609
+ tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
17610
+ {'train/tv_loss': None, 'train/lm_loss': 0.06459110975265503, 'train/info_loss': 0.21699465811252594, 'train/ref_loss': None, 'train/uncertainty_loss': -9.331009350717069e-05, 'train/video_loss': 0.21690134704113007, 'train/total_loss': 0.2814924716949463}
17611
+ tensor(-0.0014, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:2', grad_fn=<MulBackward0>)
17612
+ tensor(-0.0011, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:1', grad_fn=<MulBackward0>)
17613
+ tensor(-0.0015, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0015, device='cuda:3', grad_fn=<MulBackward0>)
17614
+ tensor(0.0351, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17615
+ {'train/tv_loss': 0.00018034547101706267, 'train/lm_loss': 3.2064729020930825e-05, 'train/info_loss': 1.9728748156921938e-05, 'train/ref_loss': 0.22903497517108917, 'train/uncertainty_loss': 0.003506988659501076, 'train/video_loss': 0.2340044528245926, 'train/total_loss': 0.23403652012348175}
17616
+ [Rank 1] Trainer log: {'loss': 0.3109, 'grad_norm': 3.874473810195923, 'learning_rate': 5.491013001009076e-06}
17617
+ [Rank 0] Trainer log: {'loss': 0.3109, 'grad_norm': 3.874473810195923, 'learning_rate': 5.491013001009076e-06}[Rank 2] Trainer log: {'loss': 0.3109, 'grad_norm': 3.874473810195923, 'learning_rate': 5.491013001009076e-06}
17618
+
17619
+ [Rank 3] Trainer log: {'loss': 0.3109, 'grad_norm': 3.874473810195923, 'learning_rate': 5.491013001009076e-06}
17620
+ {'loss': 0.3109, 'grad_norm': 3.874473810195923, 'learning_rate': 5.491013001009076e-06, 'epoch': 0.67}
17621
+ tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
17622
+ {'train/tv_loss': None, 'train/lm_loss': 0.07050921320915222, 'train/info_loss': 0.19436703622341156, 'train/ref_loss': None, 'train/uncertainty_loss': -8.927494054660201e-05, 'train/video_loss': 0.19427776336669922, 'train/total_loss': 0.2647869884967804}
17623
+ tensor(0.1349, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17624
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17625
+ tensor(0.0307, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17626
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
17627
+ tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
17628
+ tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
17629
+ {'train/tv_loss': None, 'train/lm_loss': 0.15221678018569948, 'train/info_loss': 0.10548965632915497, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011310963891446591, 'train/video_loss': 0.10537654906511307, 'train/total_loss': 0.2575933337211609}
17630
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17631
+ [Rank 1] Trainer log: {'loss': 0.2411, 'grad_norm': 2.033538818359375, 'learning_rate': 5.4814939810139236e-06}[Rank 0] Trainer log: {'loss': 0.2411, 'grad_norm': 2.033538818359375, 'learning_rate': 5.4814939810139236e-06}[Rank 3] Trainer log: {'loss': 0.2411, 'grad_norm': 2.033538818359375, 'learning_rate': 5.4814939810139236e-06}
17632
+
17633
+ [Rank 2] Trainer log: {'loss': 0.2411, 'grad_norm': 2.033538818359375, 'learning_rate': 5.4814939810139236e-06}
17634
+
17635
+ {'loss': 0.2411, 'grad_norm': 2.033538818359375, 'learning_rate': 5.4814939810139236e-06, 'epoch': 0.67}
17636
+ tensor(-0.0014, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:1', grad_fn=<MulBackward0>)
17637
+ tensor(-0.0008, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:3', grad_fn=<MulBackward0>)
17638
+ tensor(0.1315, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17639
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
17640
+ {'train/tv_loss': None, 'train/lm_loss': 0.17559853792190552, 'train/info_loss': 0.09834705293178558, 'train/ref_loss': None, 'train/uncertainty_loss': -0.0001089532976038754, 'train/video_loss': 0.09823810309171677, 'train/total_loss': 0.2738366425037384}
17641
+ tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
17642
+ tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
17643
+ tensor(-0.0008, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:1', grad_fn=<MulBackward0>)
17644
+ tensor(0.3571, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17645
+ {'train/tv_loss': 0.00020730055402964356, 'train/lm_loss': 3.654622996691615e-05, 'train/info_loss': 2.235124156868551e-05, 'train/ref_loss': 0.46529483795166016, 'train/uncertainty_loss': 0.035708272457122804, 'train/video_loss': 0.5026838779449463, 'train/total_loss': 0.5027204155921936}
17646
+ [Rank 2] Trainer log: {'loss': 0.3566, 'grad_norm': 9.683050155639648, 'learning_rate': 5.471980102922859e-06}[Rank 0] Trainer log: {'loss': 0.3566, 'grad_norm': 9.683050155639648, 'learning_rate': 5.471980102922859e-06}
17647
+ [Rank 1] Trainer log: {'loss': 0.3566, 'grad_norm': 9.683050155639648, 'learning_rate': 5.471980102922859e-06}
17648
+
17649
+ [Rank 3] Trainer log: {'loss': 0.3566, 'grad_norm': 9.683050155639648, 'learning_rate': 5.471980102922859e-06}
17650
+ {'loss': 0.3566, 'grad_norm': 9.683050155639648, 'learning_rate': 5.471980102922859e-06, 'epoch': 0.67}
17651
+ tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
17652
+ {'train/tv_loss': None, 'train/lm_loss': 0.1469507694244385, 'train/info_loss': 0.2718953788280487, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012867078185081483, 'train/video_loss': 0.271766722202301, 'train/total_loss': 0.41871750354766846}
17653
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17654
+ tensor(0.5532, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17655
+ tensor(0.4076, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17656
+ tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
17657
+ tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
17658
+ {'train/tv_loss': None, 'train/lm_loss': 0.08282127976417542, 'train/info_loss': 0.19566015899181366, 'train/ref_loss': None, 'train/uncertainty_loss': -8.814950124360622e-05, 'train/video_loss': 0.19557200372219086, 'train/total_loss': 0.2783932685852051}
17659
+ tensor(0.6230, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17660
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17661
+ [Rank 2] Trainer log: {'loss': 0.3995, 'grad_norm': 9.78901481628418, 'learning_rate': 5.4624713775623465e-06}
17662
+ [Rank 1] Trainer log: {'loss': 0.3995, 'grad_norm': 9.78901481628418, 'learning_rate': 5.4624713775623465e-06}
17663
+ [Rank 3] Trainer log: {'loss': 0.3995, 'grad_norm': 9.78901481628418, 'learning_rate': 5.4624713775623465e-06}
17664
+ [Rank 0] Trainer log: {'loss': 0.3995, 'grad_norm': 9.78901481628418, 'learning_rate': 5.4624713775623465e-06}
17665
+ {'loss': 0.3995, 'grad_norm': 9.78901481628418, 'learning_rate': 5.4624713775623465e-06, 'epoch': 0.67}
17666
+ tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
17667
+ tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
17668
+ tensor(0.4353, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17669
+ {'train/tv_loss': 0.00014477868098765613, 'train/lm_loss': 2.8036060393787923e-05, 'train/info_loss': 1.9132725356030278e-05, 'train/ref_loss': 0.5355187058448792, 'train/uncertainty_loss': 0.043532878160476685, 'train/video_loss': 0.5802289247512817, 'train/total_loss': 0.5802569389343262}
17670
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17671
+ tensor(-0.0013, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0013, device='cuda:0', grad_fn=<MulBackward0>)
17672
+ {'train/tv_loss': None, 'train/lm_loss': 0.2846735239028931, 'train/info_loss': 0.25035104155540466, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00012556229485198857, 'train/video_loss': 0.2502254843711853, 'train/total_loss': 0.5348989963531494}
17673
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17674
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17675
+ tensor(-0.0010, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:1', grad_fn=<MulBackward0>)
17676
+ [Rank 1] Trainer log: {'loss': 0.2973, 'grad_norm': 6.888955593109131, 'learning_rate': 5.452967815752981e-06}
17677
+ [Rank 0] Trainer log: {'loss': 0.2973, 'grad_norm': 6.888955593109131, 'learning_rate': 5.452967815752981e-06}[Rank 2] Trainer log: {'loss': 0.2973, 'grad_norm': 6.888955593109131, 'learning_rate': 5.452967815752981e-06}
17678
+ [Rank 3] Trainer log: {'loss': 0.2973, 'grad_norm': 6.888955593109131, 'learning_rate': 5.452967815752981e-06}
17679
+
17680
+ {'loss': 0.2973, 'grad_norm': 6.888955593109131, 'learning_rate': 5.452967815752981e-06, 'epoch': 0.67}
17681
+ tensor(-0.0012, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:0', grad_fn=<MulBackward0>)
17682
+ {'train/tv_loss': None, 'train/lm_loss': 0.35250082015991213, 'train/info_loss': 0.12986838817596436, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00011850049486383796, 'train/video_loss': 0.12974989414215088, 'train/total_loss': 0.48225072026252747}
17683
+ tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
17684
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17685
+ tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
17686
+ tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
17687
+ tensor(-0.0009, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:0', grad_fn=<MulBackward0>)
17688
+ {'train/tv_loss': None, 'train/lm_loss': 0.06482524275779725, 'train/info_loss': 0.19261880218982697, 'train/ref_loss': None, 'train/uncertainty_loss': -9.101710165850819e-05, 'train/video_loss': 0.19252778589725494, 'train/total_loss': 0.2573530375957489}
17689
+ tensor(-0.0009, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:1', grad_fn=<MulBackward0>)
17690
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17691
+ [Rank 3] Trainer log: {'loss': 0.2944, 'grad_norm': 2.9446370601654053, 'learning_rate': 5.4434694283094916e-06}[Rank 0] Trainer log: {'loss': 0.2944, 'grad_norm': 2.9446370601654053, 'learning_rate': 5.4434694283094916e-06}
17692
+ [Rank 1] Trainer log: {'loss': 0.2944, 'grad_norm': 2.9446370601654053, 'learning_rate': 5.4434694283094916e-06}
17693
+ [Rank 2] Trainer log: {'loss': 0.2944, 'grad_norm': 2.9446370601654053, 'learning_rate': 5.4434694283094916e-06}
17694
+
17695
+ {'loss': 0.2944, 'grad_norm': 2.9446370601654053, 'learning_rate': 5.4434694283094916e-06, 'epoch': 0.67}
17696
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
17697
+ tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
17698
+ tensor(-0.0009, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0009, device='cuda:2', grad_fn=<MulBackward0>)
17699
+ tensor(0.0826, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17700
+ {'train/tv_loss': 0.00014374495949596167, 'train/lm_loss': 2.815525222104043e-05, 'train/info_loss': 1.9132725356030278e-05, 'train/ref_loss': 0.2506270706653595, 'train/uncertainty_loss': 0.008263303339481354, 'train/video_loss': 0.2600594460964203, 'train/total_loss': 0.26008760929107666}
17701
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
17702
+ {'train/tv_loss': None, 'train/lm_loss': 0.32692942619323734, 'train/info_loss': 0.17380265891551971, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010667567839846016, 'train/video_loss': 0.17369598150253296, 'train/total_loss': 0.5006253719329834}
17703
+ tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
17704
+ tensor(0.4338, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17705
+ tensor(-0.0007, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17706
+ [Rank 3] Trainer log: {'loss': 0.4206, 'grad_norm': 6.2780914306640625, 'learning_rate': 5.433976226040713e-06}
17707
+ [Rank 2] Trainer log: {'loss': 0.4206, 'grad_norm': 6.2780914306640625, 'learning_rate': 5.433976226040713e-06}[Rank 1] Trainer log: {'loss': 0.4206, 'grad_norm': 6.2780914306640625, 'learning_rate': 5.433976226040713e-06}
17708
+ [Rank 0] Trainer log: {'loss': 0.4206, 'grad_norm': 6.2780914306640625, 'learning_rate': 5.433976226040713e-06}
17709
+
17710
+ {'loss': 0.4206, 'grad_norm': 6.2780914306640625, 'learning_rate': 5.433976226040713e-06, 'epoch': 0.67}
17711
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
17712
+ tensor(-0.0012, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:3', grad_fn=<MulBackward0>)
17713
+ tensor(0.0401, device='cuda:0', grad_fn=<AddBackward0>) tensor(0.1457, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17714
+ tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17715
+ {'train/tv_loss': 0.0001431380049325526, 'train/lm_loss': 2.1766431746073068e-05, 'train/info_loss': 1.561617318657227e-05, 'train/ref_loss': 0.21703669428825378, 'train/uncertainty_loss': 0.0040077798068523405, 'train/video_loss': 0.22220520675182343, 'train/total_loss': 0.22222697734832764}
17716
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
17717
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
17718
+ {'train/tv_loss': None, 'train/lm_loss': 0.19441068172454834, 'train/info_loss': 0.2764996588230133, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010609085438773036, 'train/video_loss': 0.2763935625553131, 'train/total_loss': 0.47080424427986145}
17719
+ tensor(0.0335, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17720
+ tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
17721
+ [Rank 1] Trainer log: {'loss': 0.3794, 'grad_norm': 5.537331581115723, 'learning_rate': 5.424488219749593e-06}[Rank 3] Trainer log: {'loss': 0.3794, 'grad_norm': 5.537331581115723, 'learning_rate': 5.424488219749593e-06}[Rank 0] Trainer log: {'loss': 0.3794, 'grad_norm': 5.537331581115723, 'learning_rate': 5.424488219749593e-06}
17722
+
17723
+ [Rank 2] Trainer log: {'loss': 0.3794, 'grad_norm': 5.537331581115723, 'learning_rate': 5.424488219749593e-06}
17724
+
17725
+ {'loss': 0.3794, 'grad_norm': 5.537331581115723, 'learning_rate': 5.424488219749593e-06, 'epoch': 0.67}
17726
+ tensor(-0.0008, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0008, device='cuda:2', grad_fn=<MulBackward0>)
17727
+ tensor(0.1813, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17728
+ tensor(0.1105, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17729
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
17730
+ {'train/tv_loss': None, 'train/lm_loss': 0.10859153270721436, 'train/info_loss': 0.21142883598804474, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010824981145560742, 'train/video_loss': 0.21132057905197144, 'train/total_loss': 0.3199121057987213}
17731
+ tensor(-0.0012, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:1', grad_fn=<MulBackward0>)
17732
+ tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
17733
+ tensor(0.2025, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17734
+ {'train/tv_loss': 0.00026534441858530047, 'train/lm_loss': 3.2517651561647654e-05, 'train/info_loss': 2.0682384274550714e-05, 'train/ref_loss': 0.34838536381721497, 'train/uncertainty_loss': 0.020245753228664398, 'train/video_loss': 0.3707745671272278, 'train/total_loss': 0.37080708146095276}
17735
+ tensor(-0.0012, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0012, device='cuda:2', grad_fn=<MulBackward0>)
17736
+ [Rank 3] Trainer log: {'loss': 0.4171, 'grad_norm': 7.524171829223633, 'learning_rate': 5.415005420233141e-06}[Rank 1] Trainer log: {'loss': 0.4171, 'grad_norm': 7.524171829223633, 'learning_rate': 5.415005420233141e-06}[Rank 2] Trainer log: {'loss': 0.4171, 'grad_norm': 7.524171829223633, 'learning_rate': 5.415005420233141e-06}
17737
+
17738
+
17739
+ [Rank 0] Trainer log: {'loss': 0.4171, 'grad_norm': 7.524171829223633, 'learning_rate': 5.415005420233141e-06}
17740
+ {'loss': 0.4171, 'grad_norm': 7.524171829223633, 'learning_rate': 5.415005420233141e-06, 'epoch': 0.67}
17741
+ tensor(-0.0015, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0015, device='cuda:3', grad_fn=<MulBackward0>)
17742
+ tensor(-0.0011, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:0', grad_fn=<MulBackward0>)
17743
+ {'train/tv_loss': None, 'train/lm_loss': 0.4450415134429932, 'train/info_loss': 0.18519927561283112, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00010915255406871439, 'train/video_loss': 0.18509012460708618, 'train/total_loss': 0.6301316022872925}
17744
+ tensor(0.2665, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17745
+ tensor(0.2499, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17746
+ tensor(-0.0014, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0014, device='cuda:0', grad_fn=<MulBackward0>)
17747
+ {'train/tv_loss': None, 'train/lm_loss': 0.31789367198944096, 'train/info_loss': 0.2461404800415039, 'train/ref_loss': None, 'train/uncertainty_loss': -0.00014492600457742812, 'train/video_loss': 0.24599555134773254, 'train/total_loss': 0.5638892650604248}
17748
+ tensor(-0.0007, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17749
+ tensor(0.4158, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0006, device='cuda:3', grad_fn=<MulBackward0>)
17750
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17751
+ [Rank 1] Trainer log: {'loss': 0.4059, 'grad_norm': 16.058292388916016, 'learning_rate': 5.405527838282458e-06}
17752
+ [Rank 3] Trainer log: {'loss': 0.4059, 'grad_norm': 16.058292388916016, 'learning_rate': 5.405527838282458e-06}
17753
+ [Rank 0] Trainer log: {'loss': 0.4059, 'grad_norm': 16.058292388916016, 'learning_rate': 5.405527838282458e-06}
17754
+ [Rank 2] Trainer log: {'loss': 0.4059, 'grad_norm': 16.058292388916016, 'learning_rate': 5.405527838282458e-06}
17755
+ {'loss': 0.4059, 'grad_norm': 16.058292388916016, 'learning_rate': 5.405527838282458e-06, 'epoch': 0.67}
17756
+ tensor(-0.0010, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0010, device='cuda:2', grad_fn=<MulBackward0>)
17757
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17758
+ tensor(0.0608, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17759
+ tensor(0.1152, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17760
+ {'train/tv_loss': 0.000334132113493979, 'train/lm_loss': 3.177867038175464e-05, 'train/info_loss': 2.0682384274550714e-05, 'train/ref_loss': 0.2817174196243286, 'train/uncertainty_loss': 0.011518295109272004, 'train/video_loss': 0.2959294617176056, 'train/total_loss': 0.2959612309932709}
17761
+ tensor(-0.0011, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:2', grad_fn=<MulBackward0>)
17762
+ tensor(0.3057, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17763
+ {'train/tv_loss': 0.0002599612809717655, 'train/lm_loss': 4.741583543363959e-05, 'train/info_loss': 2.3066464564180933e-05, 'train/ref_loss': 0.42135030031204224, 'train/uncertainty_loss': 0.03057071566581726, 'train/video_loss': 0.4540237784385681, 'train/total_loss': 0.45407119393348694}
17764
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)
17765
+ tensor(0.0280, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:3', grad_fn=<MulBackward0>)
17766
+ [Rank 1] Trainer log: {'loss': 0.28, 'grad_norm': 6.182593822479248, 'learning_rate': 5.396055484682719e-06}[Rank 2] Trainer log: {'loss': 0.28, 'grad_norm': 6.182593822479248, 'learning_rate': 5.396055484682719e-06}
17767
+
17768
+ [Rank 0] Trainer log: {'loss': 0.28, 'grad_norm': 6.182593822479248, 'learning_rate': 5.396055484682719e-06}[Rank 3] Trainer log: {'loss': 0.28, 'grad_norm': 6.182593822479248, 'learning_rate': 5.396055484682719e-06}
17769
+
17770
+ {'loss': 0.28, 'grad_norm': 6.182593822479248, 'learning_rate': 5.396055484682719e-06, 'epoch': 0.67}
17771
+ tensor(-0.0011, device='cuda:3', grad_fn=<AddBackward0>) tensor(-0.0011, device='cuda:3', grad_fn=<MulBackward0>)
17772
+ tensor(0.0697, device='cuda:0', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:0', grad_fn=<MulBackward0>)
17773
+ {'train/tv_loss': 0.0003155144164338708, 'train/lm_loss': 5.3374795243144035e-05, 'train/info_loss': 2.455650974297896e-05, 'train/ref_loss': 0.22853229939937592, 'train/uncertainty_loss': 0.006969699263572693, 'train/video_loss': 0.2380506843328476, 'train/total_loss': 0.23810406029224396}
17774
+ tensor(0.2178, device='cuda:2', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:2', grad_fn=<MulBackward0>)
17775
+ tensor(-0.0007, device='cuda:1', grad_fn=<AddBackward0>) tensor(-0.0007, device='cuda:1', grad_fn=<MulBackward0>)